<a href="https://colab.research.google.com/github/sravanneeli/Colab/blob/main/ZS_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re

import tensorflow as tf
import tensorflow_hub as hub

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/ZS/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/ZS/test.csv')

In [4]:
def clean_text(text):
  text = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+', 'url', text)
  text = re.sub(r'[^A-Za-z ]', ' ', text)
  text = " ".join(text.split())
  return text.lower()

In [5]:
train_df['cleaned'] = train_df['CONTENT'].apply(clean_text)
test_df['cleaned']= test_df['CONTENT'].apply(clean_text)

In [6]:
def lr_model():
  return LogisticRegression()

In [11]:
def train_ml_model(X, y, X_test):
  kf = KFold(random_state=5, n_splits=10, shuffle=True)
  scores = []
  y_pred_l = []
  iteration = 1
  for train_idx, val_idx in kf.split(X):
    X_train, X_val, y_train, y_val = X[train_idx], X[val_idx], y[train_idx], y[val_idx]
    model = lr_model()
    model.fit(X_train.toarray(), y_train)
    y_pred_val = model.predict(X_val.toarray())
    r_a_score = roc_auc_score(y_val, y_pred_val)
    scores.append(r_a_score)
    print(f"Iteration: {iteration} ROC-AUC Score - {r_a_score}")
    y_pred_l.append(model.predict_proba(X_test.toarray()))
    iteration += 1

  print(f"Number of features: {X.shape[1]}, Mean ROC-AUC Score: {np.mean(scores)}")
  return np.mean(y_pred_l, axis=0)

# Count Vectorizer 

In [8]:
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english', min_df=2)
X_train = vectorizer.fit_transform(train_df['cleaned'])
X_test = vectorizer.transform(test_df['cleaned'])

## Logistic Regression Model

In [12]:
y_pred = train_ml_model(X_train, train_df['CLASS'], X_test)

Iteration: 1 ROC-AUC Score - 0.9161425576519916
Iteration: 2 ROC-AUC Score - 0.9131727624145108
Iteration: 3 ROC-AUC Score - 0.9032451923076923
Iteration: 4 ROC-AUC Score - 0.896223609872138
Iteration: 5 ROC-AUC Score - 0.9050480769230769
Iteration: 6 ROC-AUC Score - 0.9049955396966993
Iteration: 7 ROC-AUC Score - 0.8982142857142857
Iteration: 8 ROC-AUC Score - 0.9001865671641791
Iteration: 9 ROC-AUC Score - 0.9732847601700061
Iteration: 10 ROC-AUC Score - 0.9155982905982906
Number of features: 1933, Mean ROC-AUC Score: 0.9126111642512871


In [14]:
test_df['CLASS'] = np.argmax(y_pred, axis=1)
test_df[['ID', 'CLASS']].to_csv('/content/drive/MyDrive/ZS/3.csv', index=False)

# TFIDF

In [15]:
tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', min_df=2)
X_train = tfidf.fit_transform(train_df['cleaned'])
X_test = tfidf.transform(test_df['cleaned'])

In [16]:
y_pred = train_ml_model(X_train, train_df['CLASS'], X_test)

Iteration: 1 ROC-AUC Score - 0.9097035040431267
Iteration: 2 ROC-AUC Score - 0.9304192685102587
Iteration: 3 ROC-AUC Score - 0.9278846153846154
Iteration: 4 ROC-AUC Score - 0.9046981861433245
Iteration: 5 ROC-AUC Score - 0.8816105769230769
Iteration: 6 ROC-AUC Score - 0.8971156705322628
Iteration: 7 ROC-AUC Score - 0.8738095238095237
Iteration: 8 ROC-AUC Score - 0.9106032338308458
Iteration: 9 ROC-AUC Score - 0.9650880388585308
Iteration: 10 ROC-AUC Score - 0.9172771672771673
Number of features: 1933, Mean ROC-AUC Score: 0.9118209785312732


In [17]:
test_df['CLASS'] = np.argmax(y_pred, axis=1)
test_df[['ID', 'CLASS']].to_csv('6.csv', index=False)

# Embedding Method

In [18]:
tf.random.set_seed(42)

In [19]:
embedding = "https://tfhub.dev/google/nnlm-en-dim128/2"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

In [20]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 124,646,849
Trainable params: 124,646,849
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy', tf.keras.metrics.AUC()])

In [22]:
model.fit(train_df['cleaned'], train_df['CLASS'], batch_size=16, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0a66705e90>

In [23]:
y_pred = model.predict(test_df['cleaned']) > 0.5
test_df['CLASS'] = y_pred
test_df['CLASS'] = test_df['CLASS'].apply(int)

In [24]:
test_df[['ID', 'CLASS']].to_csv('/content/drive/MyDrive/ZS/9.csv', index=False)