In [3]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Load the labelled training comments; the CLASS column is later used as the target.
df = pd.read_csv("train.csv")

In [33]:
# Plain-text dump of the frame; pandas elides the middle rows and wraps wide columns.
print(df)

      COMMENT_ID             AUTHOR                     DATE  \
0              1      Brandon Pryor      2014-01-19 00:36:25   
1              2        Chelsea Yun  2015-05-23 07:17:09.691   
2              3  Sofia Aristizabal      2014-09-09 00:43:52   
3              4     said abdesalam  2015-05-24 07:35:13.754   
4              5         crazy girl  2015-05-23 23:26:05.305   
...          ...                ...                      ...   
1364        1365          Seth Ryan  2015-05-21 20:04:26.473   
1365        1366       sagar basnet      2014-11-07 15:31:25   
1366        1367       Sarcataclysm      2014-09-20 23:11:39   
1367        1368  liu hui (dukegod)  2015-05-21 05:12:43.987   
1368        1369     RusPassingGame  2015-05-25 11:04:16.985   

                                                CONTENT  \
0     I dont even watch it anymore i just come here ...   
1                                           i hate rap﻿   
2           I loved, she is amazing.. OMG your eyes*_*

In [45]:
# Count the missing values in each column.
df.isna().sum()

COMMENT_ID      0
AUTHOR          0
DATE          170
CONTENT         0
VIDEO_NAME      0
CLASS           0
dtype: int64

We can see that the CONTENT and CLASS columns have no missing values. Only some dates are missing (170 rows), which is irrelevant for this task. Let's remove the columns we do not need.

In [7]:
# Drop the columns that are not used for classification.
# `errors="ignore"` keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising a KeyError.
df = df.drop(columns=["DATE", "VIDEO_NAME"], errors="ignore")

In [63]:
# Preview the first rows to confirm that DATE and VIDEO_NAME are gone.
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,CONTENT,CLASS
0,1,Brandon Pryor,I dont even watch it anymore i just come here ...,0
1,2,Chelsea Yun,i hate rap﻿,0
2,3,Sofia Aristizabal,"I loved, she is amazing.. OMG your eyes*_*﻿",0
3,4,said abdesalam,song is bad﻿,0
4,5,crazy girl,tension⤴︎⤴︎﻿,0


Let's create our training and testing data: 20% of the data is held out for testing and the remaining 80% is used for training.

In [9]:
# Features are the raw comment text; the target is the CLASS column.
X, Y = df["CONTENT"], df["CLASS"]
print(X)

0       I dont even watch it anymore i just come here ...
1                                             i hate rap﻿
2             I loved, she is amazing.. OMG your eyes*_*﻿
3                                            song is bad﻿
4                                            tension⤴︎⤴︎﻿
                              ...                        
1364                                          Great song﻿
1365      The population of world is more than 7 billion﻿
1366    At least she didn't get rid of her completely ...
1367                                    i love this song﻿
1368                        Who is watching in 2015 like﻿
Name: CONTENT, Length: 1369, dtype: object


In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)


In [21]:
# Fit the TF-IDF vectorizer on the training comments only, then reuse the fitted
# vectorizer on the test comments so both matrices share the same columns.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Make sure the labels are integers (0, 1) rather than strings such as "0", "1".
Y_train, Y_test = Y_train.astype("int"), Y_test.astype("int")

In [162]:
# Print the sparse TF-IDF matrix as "(row, column) weight" entries, then show
# its shape: (number of training comments, number of features).
print(X_train_features)
X_train_features.shape

  (0, 1573)	1.0
  (1, 871)	0.40278733616383167
  (1, 1711)	0.2514385472121896
  (1, 1835)	0.29365622422309184
  (1, 887)	0.4450050131747339
  (1, 466)	0.40278733616383167
  (1, 1589)	0.3605696591529295
  (1, 47)	0.4450050131747339
  (2, 2659)	0.720556436677882
  (2, 2762)	0.6933962947421
  (3, 908)	0.6662694226130323
  (3, 634)	0.5715708983600497
  (3, 2299)	0.2866538229602437
  (3, 492)	0.38369434504701716
  (4, 1519)	0.304962630114493
  (4, 635)	0.41008050372194854
  (4, 2689)	0.5001160597506003
  (4, 1917)	0.699082042023923
  (5, 1519)	0.5967417939218324
  (5, 635)	0.8024333189661016
  (6, 1711)	0.11370506094866846
  (6, 1835)	0.13279665844181177
  (6, 1189)	0.10621509462342904
  (6, 580)	0.19249015544899983
  (6, 962)	0.1407203873226042
  :	:
  (1092, 2635)	0.5995274064656667
  (1093, 2168)	0.4780979643735903
  (1093, 1445)	0.46196573830294096
  (1093, 2668)	0.5282092355762288
  (1093, 1446)	0.5282092355762288
  (1094, 624)	0.0297893337224674
  (1094, 2773)	0.037426386320677255
  (

(1095, 2821)

We created the TF-IDF matrix, in which each comment is represented as a vector whose entries weight the words it contains. 
Notice that there are 2821 columns, meaning there are 2821 features (that's a lot!). 

In [23]:
model = LogisticRegression() # unfitted logistic regression classifier using scikit-learn's default settings

In [25]:
# Train the classifier on the training TF-IDF features and labels.
# NOTE: scikit-learn's LogisticRegression applies an L2 penalty by default, so this
# is regularised (penalised) maximum likelihood rather than plain MLE.
model.fit(X_train_features, Y_train)

In [27]:
# Predict a 0/1 label for every row of the training matrix; each row is the
# vector representation of one comment, classified as spam or non-spam.
prediction_on_training_data = model.predict(X_train_features)

# Accuracy = number of correct predictions / total number of predictions,
# i.e. 1 - training error rate.
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [132]:
# Measured on the same rows the model was fitted on, so this is an optimistic estimate.
print(accuracy_on_training_data)

0.9753424657534246


On the training data, the fitted logistic regression model predicted the correct class 97.5% of the time. 

In [29]:
# Score the model on the held-out comments, which were not used for fitting.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [31]:
# Accuracy on the 20% hold-out split.
print(accuracy_on_test_data)

0.9671532846715328


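The accuracy on the test data is also high (about 96.7%) and close to the training accuracy, so the model generalises well to comments it has not seen. 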

In [61]:
# Classify a custom comment: replace the empty string with the text to check.
# NOTE(review): an empty string contains no vocabulary terms, so its feature vector
# is presumably all zeros and the prediction reflects only the model's intercept.
input_your_comment = [""]
# Reuse the vectorizer fitted on the training data so the columns match the model.
input_data_features = feature_extraction.transform(input_your_comment)
prediction = model.predict(input_data_features)
print(prediction)


[0]


In [187]:
# Load the unlabelled comments to classify (unlike train.csv, there is no CLASS column).
df2 = pd.read_csv("test.csv")

In [189]:
# Preview the first rows of the unlabelled data.
df2.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME
0,1370,tyler sleetway,2013-10-05 00:57:25.078,so beutiful,Shakira - Waka Waka
1,1371,Young Hittaz,2014-01-19 04:21:11,everyone please come check our newest song in ...,PSY - GANGNAM STYLE(?????) M/V
2,1372,WeMuckAround,2014-08-16 20:59:28,katy perry will u sit on my face please. it wo...,Katy Perry - Roar
3,1373,DanteBTV,,Check Out The New Hot Video By Dante B Called ...,Eminem - Love The Way You Lie ft. Rihanna
4,1374,Sheila Cenabre,2014-08-19 12:33:11,I really love this video.. http://www.bubblews...,Katy Perry - Roar


In [199]:
# test.csv has no CLASS column, so these predictions cannot be scored against
# true labels here; they can only be inspected or exported.
# CONTENT in test.csv is never checked for missing values, and the vectorizer
# raises a ValueError on NaN documents, so missing comments become empty strings.
X1 = df2["CONTENT"].fillna("")
# Reuse the vectorizer and model fitted on the training data.
transform_X1 = feature_extraction.transform(X1)
new_prediction = model.predict(transform_X1)