<a href="https://colab.research.google.com/github/sravanneeli/Colab/blob/main/ZS_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd
import re

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/ZS/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/ZS/test.csv')

In [None]:
def clean_text(text):
  text = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+', 'url', text)
  text = re.sub(r'[^A-Za-z ]', ' ', text)
  text = " ".join(text.split())
  return text.lower()

In [None]:
train_df['cleaned'] = train_df['CONTENT'].apply(clean_text)
test_df['cleaned']= test_df['CONTENT'].apply(clean_text)

In [None]:
def train_ml_model(model, X, y, X_test):
  kf = KFold(random_state=5, n_splits=10, shuffle=True)
  scores = []
  y_pred_l = []
  iteration = 1
  for train_idx, val_idx in kf.split(X):
    X_train, X_val, y_train, y_val = X[train_idx], X[val_idx], y[train_idx], y[val_idx]
    model.fit(X_train.toarray(), y_train)
    y_pred_val = model.predict(X_val.toarray())
    r_a_score = roc_auc_score(y_val, y_pred_val)
    scores.append(r_a_score)
    print(f"Iteration: {iteration} ROC-AUC Score - {r_a_score}")
    y_pred_l.append(model.predict_proba(X_test.toarray()))
    iteration += 1

  print(f"Number of features: {X.shape[1]}, Mean ROC-AUC Score: {np.mean(scores)}")
  return np.mean(y_pred_l, axis=0)

# Count Vectorizer 

In [None]:
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english', min_df=2)
X_train = vectorizer.fit_transform(train_df['cleaned'])
X_test = vectorizer.transform(test_df['cleaned'])

## Logistic Regression Model

In [None]:
lr = LogisticRegression()
y_pred = train_ml_model(lr, X_train, train_df['CLASS'], X_test)

Iteration: 1 ROC-AUC Score - 0.9161425576519916
Iteration: 2 ROC-AUC Score - 0.9131727624145108
Iteration: 3 ROC-AUC Score - 0.9032451923076923
Iteration: 4 ROC-AUC Score - 0.896223609872138
Iteration: 5 ROC-AUC Score - 0.9050480769230769
Iteration: 6 ROC-AUC Score - 0.9049955396966993
Iteration: 7 ROC-AUC Score - 0.8982142857142857
Iteration: 8 ROC-AUC Score - 0.9001865671641791
Iteration: 9 ROC-AUC Score - 0.9732847601700061
Iteration: 10 ROC-AUC Score - 0.9155982905982906
Number of features: 1933, Mean ROC-AUC Score: 0.9126111642512871


In [None]:
test_df['CLASS'] = np.argmax(y_pred, axis=1)
test_df[['ID', 'CLASS']].to_csv('3.csv', index=False)

# TFIDF

In [57]:
tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', min_df=2)
X_train = tfidf.fit_transform(train_df['cleaned'])
X_test = tfidf.transform(test_df['cleaned'])

In [58]:
lr = LogisticRegression()
y_pred = train_ml_model(lr, X_train, train_df['CLASS'], X_test)

Iteration: 1 ROC-AUC Score - 0.9097035040431267
Iteration: 2 ROC-AUC Score - 0.9304192685102587
Iteration: 3 ROC-AUC Score - 0.9278846153846154
Iteration: 4 ROC-AUC Score - 0.9046981861433245
Iteration: 5 ROC-AUC Score - 0.8816105769230769
Iteration: 6 ROC-AUC Score - 0.8971156705322628
Iteration: 7 ROC-AUC Score - 0.8738095238095237
Iteration: 8 ROC-AUC Score - 0.9106032338308458
Iteration: 9 ROC-AUC Score - 0.9650880388585308
Iteration: 10 ROC-AUC Score - 0.9172771672771673
Number of features: 1933, Mean ROC-AUC Score: 0.9118209785312732


In [59]:
test_df['CLASS'] = np.argmax(y_pred, axis=1)
test_df[['ID', 'CLASS']].to_csv('6.csv', index=False)