In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
import re
import spacy
nlp=spacy.load('en_core_web_sm')



In [None]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
comments_test = df_test["CONTENT"]
print(comments_test)

In [None]:
print(df)

In [None]:
print(df_test)
df_test.head()

In [None]:
df.isnull().sum()

We can see that the content and class column has 0 missing values. Only some dates missing which is pretty irrelevant. Let's remove irrelevant columns. 

In [113]:
df = df.drop(["DATE", "VIDEO_NAME"], axis = 1)

In [None]:
df.head(10)

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/svishwa4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
stop_words = stopwords.words('english')
print(f" Stop words used: \n", stop_words)

 Stop words used: 
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'sa

In [5]:
# Clean the dataset by removing stop words
df_clean = df
print(f" data frame before cleaning: \n", df.head(10))

stop_words = stopwords.words('english')
print(f" Stop words used: \n", stop_words)

def clean_text(text):
  text=text.lower()
  text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
  text = re.sub(r'\@\w+|\#','', text)
  text = re.sub(r'[^\x00-\x7F]+','', text)
  text=re.sub(r'[^a-zA-Z0-9\s]','',text)
  text=re.sub(r'\s+https\S+',' ',text)
  text=re.sub(r'\s+www\S+',' ',text)
  text=re.sub(r'\s+http\S+',' ',text)
  text=re.sub(r'\s+',' ',text)
  text=re.sub(r'\d+','',text)
  text=text.strip()
  words = nlp(text)
  words = [word.lemma_ for word in words]
  words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 2]
  return ' '.join(words)


df_clean['CONTENT']=df_clean['CONTENT'].apply(clean_text)
print(f" data frame after cleaning: \n", df_clean.head(10))

# df = df_clean

NameError: name 'df' is not defined

Let's create our training and testing data. 20% of data for testing and 80% for training

In [114]:
X = df["CONTENT"]
Y = df["CLASS"]
print(X)

0       I dont even watch it anymore i just come here ...
1                                             i hate rap﻿
2             I loved, she is amazing.. OMG your eyes*_*﻿
3                                            song is bad﻿
4                                            tension⤴︎⤴︎﻿
                              ...                        
1364                                          Great song﻿
1365      The population of world is more than 7 billion﻿
1366    At least she didn't get rid of her completely ...
1367                                    i love this song﻿
1368                        Who is watching in 2015 like﻿
Name: CONTENT, Length: 1369, dtype: object


In [115]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3)


In [116]:
# Change the min_df from 1 to 5, i.e, if a word appears in less than 5 sentences, drop it. 
feature_extraction = TfidfVectorizer(min_df = 2, stop_words = 'english', lowercase = True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#let's make sure the labels for Y are in int form e.g 0, 1 and not any other like "0", "1"
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [117]:
test_data_X = feature_extraction.transform(comments_test)

In [118]:
X_train_features.shape

(1095, 884)

In [119]:
print(X_train_features)
X_train_features.shape

  (np.int32(0), np.int32(456))	1.0
  (np.int32(1), np.int32(210))	0.5182967949860864
  (np.int32(1), np.int32(505))	0.32354491180683187
  (np.int32(1), np.int32(537))	0.3778695757719579
  (np.int32(1), np.int32(94))	0.5182967949860864
  (np.int32(1), np.int32(462))	0.46397213102096047
  (np.int32(2), np.int32(820))	0.7205564366778823
  (np.int32(2), np.int32(861))	0.6933962947421001
  (np.int32(3), np.int32(222))	0.6662694226130323
  (np.int32(3), np.int32(144))	0.5715708983600497
  (np.int32(3), np.int32(697))	0.2866538229602437
  (np.int32(3), np.int32(103))	0.38369434504701716
  (np.int32(4), np.int32(434))	0.4264964264172703
  (np.int32(4), np.int32(145))	0.5735059056748782
  (np.int32(4), np.int32(833))	0.6994224577530634
  (np.int32(5), np.int32(434))	0.5967417939218324
  (np.int32(5), np.int32(145))	0.8024333189661016
  (np.int32(6), np.int32(505))	0.13829275095766727
  (np.int32(6), np.int32(537))	0.16151273356420412
  (np.int32(6), np.int32(322))	0.1291831472245031
  (np.int32

(1095, 884)

We created the TF-IDF matrix where each sentence represented as a vector with weighing given to certain words. 
Notice how there are 2821 columns meaning there are 2821 features (DAMN). 

In [123]:
model = LogisticRegression() #create an instance of a logistic regression model I can train

In [124]:
# Experiment with SVC model

from sklearn import svm
svmModel = svm.SVC()
model.fit(X_train_features, Y_train)
prediction_on_training_data = model.predict(X_train_features) 

#accuracy score = # of correct predictions / Total # of predictions 
#CALCULATING 1 - TRAINING ERROR RATE 
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print(accuracy_on_training_data)

# Test accuracy
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print(accuracy_on_test_data)

0.965296803652968
0.9598540145985401


In [121]:
model.fit(X_train_features, Y_train) #training logistic regression model on training data using MLE

In [122]:
#the next line takes the matrix of training data and spits out an array of predictions full of 
#0s and 1s, classifying each vector (representation of a sentence) as spam or non-spam
prediction_on_training_data = model.predict(X_train_features) 

#accuracy score = # of correct predictions / Total # of predictions 
#CALCULATING 1 - TRAINING ERROR RATE 
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print(accuracy_on_training_data)

0.965296803652968


In [58]:
test_data_prediction = model.predict(test_data_X) 
print(test_data_prediction)
test_data_prediction.shape
df_test["CLASS"] = test_data_prediction
df_test.head()


[0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 1 0
 0 0 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1
 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 1 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0
 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0
 1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1
 0 0 0 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 0
 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1
 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0 1 0 0
 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 0 1 0 1 1 0 0 0 1 0 1 

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME,CLASS
0,1370,tyler sleetway,2013-10-05 00:57:25.078,so beutiful,Shakira - Waka Waka,0
1,1371,Young Hittaz,2014-01-19 04:21:11,everyone please come check our newest song in ...,PSY - GANGNAM STYLE(?????) M/V,1
2,1372,WeMuckAround,2014-08-16 20:59:28,katy perry will u sit on my face please. it wo...,Katy Perry - Roar,0
3,1373,DanteBTV,,Check Out The New Hot Video By Dante B Called ...,Eminem - Love The Way You Lie ft. Rihanna,1
4,1374,Sheila Cenabre,2014-08-19 12:33:11,I really love this video.. http://www.bubblews...,Katy Perry - Roar,0


In [59]:
# Drop every column except for commentID and Class
df_test = df_test.drop(["AUTHOR", "DATE", "CONTENT", "VIDEO_NAME"], axis = 1)
df_test.head()

Unnamed: 0,COMMENT_ID,CLASS
0,1370,0
1,1371,1
2,1372,0
3,1373,1
4,1374,0


In [60]:
#Store df_test in a .csv file
df_test.to_csv("classified_result_model2.csv", index=False)

In [61]:
# print(X_train_features)
prediction_on_training_data

array([0, 1, 0, ..., 1, 0, 0])

In [62]:
print(accuracy_on_training_data)

0.9260273972602739


Our predictions were correct 97.5% of the time using the logistic regression model we fitted. 

In [63]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print(accuracy_on_test_data)

0.9233576642335767


Pretty good accuracy on test data as well. 

In [64]:
input_your_comment = [""]
input_data_features = feature_extraction.transform(input_your_comment)
prediction = model.predict(input_data_features)
print(prediction)


[0]


In [65]:
df2 = pd.read_csv("test.csv")

In [66]:
df2.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME
0,1370,tyler sleetway,2013-10-05 00:57:25.078,so beutiful,Shakira - Waka Waka
1,1371,Young Hittaz,2014-01-19 04:21:11,everyone please come check our newest song in ...,PSY - GANGNAM STYLE(?????) M/V
2,1372,WeMuckAround,2014-08-16 20:59:28,katy perry will u sit on my face please. it wo...,Katy Perry - Roar
3,1373,DanteBTV,,Check Out The New Hot Video By Dante B Called ...,Eminem - Love The Way You Lie ft. Rihanna
4,1374,Sheila Cenabre,2014-08-19 12:33:11,I really love this video.. http://www.bubblews...,Katy Perry - Roar


In [67]:
X1 = df2["CONTENT"]
#No Y labels. How can I compare this to true labels? idk
transform_X1 = feature_extraction.transform(X1)
new_prediction = model.predict(transform_X1)

In [68]:
# Now that we have a model above that works, we will look at grid-search CV methods to search for the best model using hyperparameter tuning

# Consturct a parameter grid 
param_grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [100,1000,2500,5000]
}
]


In [69]:

# Load grid-search-cv frok sklearn and fit the model 
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(model,param_grid = param_grid, cv = 3, verbose=True,n_jobs=-1)

best_clf = clf.fit(X_train_features, Y_train)
print(best_clf.best_estimator_)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits




LogisticRegression(C=np.float64(4.281332398719396), penalty='l1',
                   solver='liblinear')


3120 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/svishwa4/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/svishwa4/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/svishwa4/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/svishwa4/Library/Python/3.9/lib/py

In [70]:
print(best_clf.best_estimator_)
print(best_clf.best_estimator_.get_params())


LogisticRegression(C=np.float64(4.281332398719396), penalty='l1',
                   solver='liblinear')
{'C': np.float64(4.281332398719396), 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [71]:
# Checking the accuracy score on the best fit model
print(f'Accuracy after tuninig parameters - : {best_clf.score(X_train_features, Y_train):.3f}')
# accuracy_score(Y_train, prediction_on_training_data)

Accuracy after tuninig parameters - : 0.947


In [72]:
clf_prediction_on_training_data = clf.predict(X_train_features) 

#accuracy score = # of correct predictions / Total # of predictions 
#CALCULATING 1 - TRAINING ERROR RATE 
clf_accuracy_on_training_data = accuracy_score(Y_train, clf_prediction_on_training_data)
print(clf_accuracy_on_training_data)

0.9470319634703196


In [73]:
clf_prediction_on_train_test_data = clf.predict(X_test_features)
clf_accuracy_on_test_data = accuracy_score(Y_test, clf_prediction_on_train_test_data)
print(clf_accuracy_on_test_data)

0.927007299270073


In [74]:
# Run the clf_prediction on the test data set and store its values in a new csv file
clf_prediction_on_test_test_data = clf.predict(test_data_X)
print(clf_prediction_on_test_test_data)
df_test["CLASS"] = test_data_prediction
print(df_test.head())
#Store df_test in a .csv file
df_test.to_csv("classified_result_model3.csv", index=False)

[0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1
 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0
 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0
 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0
 1 0 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0 1 1
 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 0 0 1 1 0
 0 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 0
 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1
 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 0 1 0 1 1 0 1 0 0
 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0
 0 0 1 0 1 1 0 0 0 1 0 0 