In [158]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [159]:
# Read the labelled training comments and the unlabelled competition comments
# (train.csv first, then test.csv).
df, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Raw comment text of the competition set; this is what gets classified later.
comments_test = df_test["CONTENT"]
print(comments_test)

0                                            so beutiful
1      everyone please come check our newest song in ...
2      katy perry will u sit on my face please. it wo...
3      Check Out The New Hot Video By Dante B Called ...
4      I really love this video.. http://www.bubblews...
                             ...                        
582                 please subscribe to my page. thanks.
583                                     I love this song
584                                   Eminem THE BEST !﻿
585                                     beautiful song!﻿
586             why I dont see any comments but mine?:/﻿
Name: CONTENT, Length: 587, dtype: object


In [160]:
# Plain-text dump of the training frame; pandas elides the middle rows.
print(df)

      COMMENT_ID             AUTHOR                     DATE  \
0              1      Brandon Pryor      2014-01-19 00:36:25   
1              2        Chelsea Yun  2015-05-23 07:17:09.691   
2              3  Sofia Aristizabal      2014-09-09 00:43:52   
3              4     said abdesalam  2015-05-24 07:35:13.754   
4              5         crazy girl  2015-05-23 23:26:05.305   
...          ...                ...                      ...   
1364        1365          Seth Ryan  2015-05-21 20:04:26.473   
1365        1366       sagar basnet      2014-11-07 15:31:25   
1366        1367       Sarcataclysm      2014-09-20 23:11:39   
1367        1368  liu hui (dukegod)  2015-05-21 05:12:43.987   
1368        1369     RusPassingGame  2015-05-25 11:04:16.985   

                                                CONTENT  \
0     I dont even watch it anymore i just come here ...   
1                                           i hate rap﻿   
2           I loved, she is amazing.. OMG your eyes*_*

In [161]:
# Plain-text dump of the competition frame, followed by a rich preview of its
# first five rows (the cell's last expression).
print(df_test)
df_test.head()

     COMMENT_ID          AUTHOR                     DATE  \
0          1370  tyler sleetway  2013-10-05 00:57:25.078   
1          1371    Young Hittaz      2014-01-19 04:21:11   
2          1372    WeMuckAround      2014-08-16 20:59:28   
3          1373        DanteBTV                      NaN   
4          1374  Sheila Cenabre      2014-08-19 12:33:11   
..          ...             ...                      ...   
582        1952    Stanša Matej  2013-08-13 19:18:54.893   
583        1953       Alaa Foad  2013-09-06 18:32:07.187   
584        1954        Mortal 2  2015-05-20 16:32:56.261   
585        1955  Giovanni Satta  2015-05-20 09:10:44.658   
586        1956       mindaugux      2014-11-08 00:36:31   

                                               CONTENT  \
0                                          so beutiful   
1    everyone please come check our newest song in ...   
2    katy perry will u sit on my face please. it wo...   
3    Check Out The New Hot Video By Dante B Cal

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME
0,1370,tyler sleetway,2013-10-05 00:57:25.078,so beutiful,Shakira - Waka Waka
1,1371,Young Hittaz,2014-01-19 04:21:11,everyone please come check our newest song in ...,PSY - GANGNAM STYLE(?????) M/V
2,1372,WeMuckAround,2014-08-16 20:59:28,katy perry will u sit on my face please. it wo...,Katy Perry - Roar
3,1373,DanteBTV,,Check Out The New Hot Video By Dante B Called ...,Eminem - Love The Way You Lie ft. Rihanna
4,1374,Sheila Cenabre,2014-08-19 12:33:11,I really love this video.. http://www.bubblews...,Katy Perry - Roar


In [162]:
# Number of missing values in each column of the training frame.
df.isnull().sum()

COMMENT_ID      0
AUTHOR          0
DATE          170
CONTENT         0
VIDEO_NAME      0
CLASS           0
dtype: int64

We can see that the CONTENT and CLASS columns have no missing values. Only some DATE values are missing, which does not matter because the date is not used as a feature. Let's remove the columns we do not need.

In [164]:
# Remove the columns that are not used as features.
# errors="ignore" keeps this cell safe to re-run: once the columns are gone, a
# plain drop would raise KeyError on the second execution.
df = df.drop(columns=["DATE", "VIDEO_NAME"], errors="ignore")

In [165]:
# Preview the first ten rows of the training frame after the column drop.
df.head(10)

Unnamed: 0,COMMENT_ID,AUTHOR,CONTENT,CLASS
0,1,Brandon Pryor,I dont even watch it anymore i just come here ...,0
1,2,Chelsea Yun,i hate rap﻿,0
2,3,Sofia Aristizabal,"I loved, she is amazing.. OMG your eyes*_*﻿",0
3,4,said abdesalam,song is bad﻿,0
4,5,crazy girl,tension⤴︎⤴︎﻿,0
5,6,Bob Orton,love this song﻿,0
6,7,LEGO_01 AND OBRY,"Holy crap. 800,000,000 views?!﻿",0
7,8,Dakota Taylor,Cool﻿,0
8,9,Callum Hudson,Best for partying ﻿,0
9,10,Shakira Forever,Shakira﻿,0


Let's create our training and testing data: 20% of the data for testing and 80% for training.

In [167]:
# Features are the raw comment text; the target is the CLASS label.
X, Y = df["CONTENT"], df["CLASS"]
print(X)

0       I dont even watch it anymore i just come here ...
1                                             i hate rap﻿
2             I loved, she is amazing.. OMG your eyes*_*﻿
3                                            song is bad﻿
4                                            tension⤴︎⤴︎﻿
                              ...                        
1364                                          Great song﻿
1365      The population of world is more than 7 billion﻿
1366    At least she didn't get rid of her completely ...
1367                                    i love this song﻿
1368                        Who is watching in 2015 like﻿
Name: CONTENT, Length: 1369, dtype: object


In [168]:
# Hold out 20% of the labelled comments for evaluation; the fixed seed makes
# the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)


In [169]:
# Learn the TF-IDF vocabulary from the training comments only, then apply the
# same fitted vectorizer to the hold-out comments (transform, never refit).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=2,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Force the labels to integers (0, 1) in case they were read as strings ("0", "1").
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [170]:
# Vectorize the competition comments (test.csv) with the vocabulary and IDF
# weights already fitted on X_train; transform() does not refit.
test_data_X = feature_extraction.transform(comments_test)

In [171]:
# (number of training comments, number of TF-IDF features)
X_train_features.shape

(1095, 884)

In [172]:
# Printing the sparse matrix lists its stored entries as "(row, column) weight".
print(X_train_features)
X_train_features.shape

  (0, 456)	1.0
  (1, 210)	0.5182967949860864
  (1, 505)	0.32354491180683187
  (1, 537)	0.3778695757719579
  (1, 94)	0.5182967949860864
  (1, 462)	0.46397213102096047
  (2, 820)	0.720556436677882
  (2, 861)	0.6933962947421
  (3, 222)	0.6662694226130323
  (3, 144)	0.5715708983600497
  (3, 697)	0.2866538229602437
  (3, 103)	0.38369434504701716
  (4, 434)	0.4264964264172703
  (4, 145)	0.5735059056748782
  (4, 833)	0.6994224577530634
  (5, 434)	0.5967417939218324
  (5, 145)	0.8024333189661016
  (6, 505)	0.13829275095766727
  (6, 537)	0.16151273356420412
  (6, 322)	0.1291831472245031
  (6, 125)	0.23411440886812107
  (6, 235)	0.1711498970785194
  (6, 838)	0.16442755846277454
  (6, 110)	0.1488736206451061
  (6, 883)	0.5831096390551688
  :	:
  (1088, 514)	0.4554201814775826
  (1088, 754)	0.7048507133391682
  (1089, 87)	0.35423850740825885
  (1089, 460)	0.41185316300718056
  (1089, 283)	0.492071051804957
  (1089, 220)	0.400753983188702
  (1089, 89)	0.5496857074038787
  (1090, 125)	0.492699506129

(1095, 884)

We created the TF-IDF matrix, in which each comment is represented as a vector of term weights. There were initially 2821 features (columns); setting min_df = 2, a value found by cross-validation with GridSearchCV, reduced this to 884.

In [174]:
# Logistic-regression classifier with scikit-learn's default settings (not yet fitted).
model = LogisticRegression()

In [175]:
# Fit the classifier on the TF-IDF training features.
# NOTE: scikit-learn's default LogisticRegression applies L2 regularisation
# (C=1.0), so this is penalised maximum likelihood rather than plain MLE.
model.fit(X_train_features, Y_train)

In [176]:
# predict() takes the training feature matrix and returns one 0/1 label per row,
# classifying each vector (the representation of one comment) as spam or non-spam.
prediction_on_training_data = model.predict(X_train_features)

# accuracy = number of correct predictions / total number of predictions,
# i.e. 1 - training error rate. Measured on the data the model was fitted on,
# so it is an optimistic estimate.
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)


In [177]:
# Classify the competition comments (test.csv) with the fitted model.
test_data_prediction = model.predict(test_data_X)
print(test_data_prediction)

# There must be exactly one prediction per row before attaching them as a column.
# (A bare `.shape` expression in the middle of a cell displays nothing, so the
# check is made explicit instead.)
assert test_data_prediction.shape == (len(df_test),)

df_test["CLASS"] = test_data_prediction
df_test.head()



[0 1 0 1 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0
 0 0 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 1
 1 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 1 1 0
 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 0
 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0
 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1
 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1
 0 0 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0
 1 0 0 1 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1
 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0 1 0 0
 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 0 1 0 0 1 0 0 0 1 0 1 

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME,CLASS
0,1370,tyler sleetway,2013-10-05 00:57:25.078,so beutiful,Shakira - Waka Waka,0
1,1371,Young Hittaz,2014-01-19 04:21:11,everyone please come check our newest song in ...,PSY - GANGNAM STYLE(?????) M/V,1
2,1372,WeMuckAround,2014-08-16 20:59:28,katy perry will u sit on my face please. it wo...,Katy Perry - Roar,0
3,1373,DanteBTV,,Check Out The New Hot Video By Dante B Called ...,Eminem - Love The Way You Lie ft. Rihanna,1
4,1374,Sheila Cenabre,2014-08-19 12:33:11,I really love this video.. http://www.bubblews...,Katy Perry - Roar,1


In [178]:
# Keep only COMMENT_ID and CLASS for the submission file.
# Selecting the two columns (instead of dropping the other four) makes this
# cell idempotent: re-running a drop raises KeyError because the columns are
# already gone.
df_test = df_test[["COMMENT_ID", "CLASS"]]
df_test.head()

Unnamed: 0,COMMENT_ID,CLASS
0,1370,0
1,1371,1
2,1372,0
3,1373,1
4,1374,1


In [179]:
# Store df_test in a .csv file; index=False leaves the pandas row index out of the file.
df_test.to_csv("classified_result.csv", index=False)

In [180]:
# Peek at the labels predicted for the training rows (NumPy abbreviates long arrays).
prediction_on_training_data

array([0, 1, 0, ..., 1, 0, 1])

In [181]:
# Accuracy on the training rows themselves, not on unseen data.
print(accuracy_on_training_data)

0.965296803652968


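Our predictions were correct 96.5% of the time on the training data using the logistic regression model we fitted. Because this is measured on the same data the model was trained on, it is an optimistic estimate; the hold-out accuracy below is the more meaningful number.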

In [183]:
# Evaluate on the 20% hold-out split of train.csv (X_test / Y_test), which was
# not used for fitting. This is NOT the unlabelled competition file test.csv.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [184]:
# Accuracy on the 20% hold-out split.
print(accuracy_on_test_data)

0.9598540145985401


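The accuracy on the held-out 20% of the data (about 96.0%) is nearly as high as the training accuracy, so the model does not appear to be overfitting much.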

In [186]:
# Try the model on your own text: put the comment to classify inside the list.
input_your_comment = [""]
# Reuse the fitted vectorizer so the columns match the features the model was trained on.
input_data_features = feature_extraction.transform(input_your_comment)
prediction = model.predict(input_data_features)
print(prediction)


[0]


Now let's use hyperparameter tuning to make our estimates better 

In [188]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [189]:
# TF-IDF vectorizer followed by logistic regression, wrapped as one estimator so
# the grid search can tune the vectorizer's parameters via the 'tfidf__' prefix.
pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    # NOTE: solver and max_iter are set explicitly here; these are not the
    # defaults used by the bare LogisticRegression() fitted earlier in the notebook.
    ('logreg', LogisticRegression(solver='liblinear', max_iter=1000))
])


In [190]:
# Vocabulary-pruning thresholds to search over for the pipeline's 'tfidf' step.
param_grid_tfidf = dict(
    tfidf__min_df=list(range(1, 11)),     # minimum document count: 1 through 10
    tfidf__max_df=[0.7, 0.8, 0.9, 1.0],   # maximum document fraction
)

In [191]:
# Exhaustive search over every min_df / max_df pair, scored by 3-fold
# cross-validated accuracy on the training split, using all available cores.
grid_search_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=param_grid_tfidf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_tfidf.fit(X_train, Y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [192]:
# Parameter combination with the best mean cross-validated accuracy.
best_tfidf_params = grid_search_tfidf.best_params_
print("Best TfidfVectorizer Parameters: ", best_tfidf_params)

Best TfidfVectorizer Parameters:  {'tfidf__max_df': 0.7, 'tfidf__min_df': 2}


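Next, let's compare different training/testing split sizes. Note that this is not a grid search or cross-validation: for each candidate test size we make a single hold-out split, train the model on the training part, and record the accuracy on the held-out part.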

In [194]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Original data preparation: re-select text and labels from the full training
# frame. This rebinds the notebook-level X and Y.
X = df["CONTENT"]
Y = df["CLASS"].astype("int")  # Ensure Y labels are integers

# Function to test multiple train-test splits
def test_train_test_split(X, Y, splits=[0.2, 0.3, 0.1]):
    results = {}
    for split in splits:
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=split, random_state=3
        )
        # Vectorizing
        vectorizer = TfidfVectorizer(min_df=2, stop_words="english", lowercase=True)
        X_train_features = vectorizer.fit_transform(X_train)
        X_test_features = vectorizer.transform(X_test)
        
        # Model Training
        model = LogisticRegression()
        model.fit(X_train_features, Y_train)
        
        # Prediction and Accuracy
        Y_pred = model.predict(X_test_features)
        accuracy = accuracy_score(Y_test, Y_pred)
        results[split] = accuracy
    
    return results

# Test different splits: candidate test-set fractions from 5% to 30%.
splits_to_test = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
# Result maps each test fraction to its hold-out accuracy.
split_results = test_train_test_split(X, Y, splits=splits_to_test)
print("Accuracy for different train-test splits:", split_results)

Accuracy for different train-test splits: {0.05: 0.9420289855072463, 0.1: 0.9343065693430657, 0.15: 0.9563106796116505, 0.2: 0.9598540145985401, 0.25: 0.967930029154519, 0.3: 0.9586374695863747}


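We can see that the highest testing accuracy is achieved by a split of 25% testing data and 75% training data. However, when tested on the Kaggle test dataset, it yielded a lower score than the 20% testing / 80% training split, so we keep the original split. Each accuracy above comes from a single random split, so the small differences between split sizes may simply be noise.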

Our grid search result above gives us the best tuning parameters for our TF-IDF vectorizer. min_df = 2 means we discard words that appear in fewer than 2 comments. max_df = 0.7 means we ignore words that appear in more than 70% of the comments. Our initial testing was done with the default values, min_df = 1 and max_df = 1.0 (that is, no words are pruned by document frequency).