In [93]:
import numpy as np 
import pandas as pd
import re
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

Read the train data

In [94]:
df = pd.read_csv("train.csv")
print(df)

      COMMENT_ID             AUTHOR                     DATE  \
0              1      Brandon Pryor      2014-01-19 00:36:25   
1              2        Chelsea Yun  2015-05-23 07:17:09.691   
2              3  Sofia Aristizabal      2014-09-09 00:43:52   
3              4     said abdesalam  2015-05-24 07:35:13.754   
4              5         crazy girl  2015-05-23 23:26:05.305   
...          ...                ...                      ...   
1364        1365          Seth Ryan  2015-05-21 20:04:26.473   
1365        1366       sagar basnet      2014-11-07 15:31:25   
1366        1367       Sarcataclysm      2014-09-20 23:11:39   
1367        1368  liu hui (dukegod)  2015-05-21 05:12:43.987   
1368        1369     RusPassingGame  2015-05-25 11:04:16.985   

                                                CONTENT  \
0     I dont even watch it anymore i just come here ...   
1                                           i hate rap﻿   
2           I loved, she is amazing.. OMG your eyes*_*

Check the missing data and keep the data we'd work on

In [95]:
df.isnull().sum()

COMMENT_ID      0
AUTHOR          0
DATE          170
CONTENT         0
VIDEO_NAME      0
CLASS           0
dtype: int64

In [96]:
df = df.drop(["DATE", "VIDEO_NAME"], axis = 1)

In [97]:
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,CONTENT,CLASS
0,1,Brandon Pryor,I dont even watch it anymore i just come here ...,0
1,2,Chelsea Yun,i hate rap﻿,0
2,3,Sofia Aristizabal,"I loved, she is amazing.. OMG your eyes*_*﻿",0
3,4,said abdesalam,song is bad﻿,0
4,5,crazy girl,tension⤴︎⤴︎﻿,0


Data cleaning

In [98]:
df['CONTENT'] = df['CONTENT'] # .apply(clean_text)

In [99]:
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,CONTENT,CLASS
0,1,Brandon Pryor,I dont even watch it anymore i just come here ...,0
1,2,Chelsea Yun,i hate rap﻿,0
2,3,Sofia Aristizabal,"I loved, she is amazing.. OMG your eyes*_*﻿",0
3,4,said abdesalam,song is bad﻿,0
4,5,crazy girl,tension⤴︎⤴︎﻿,0


Create bag of words model, set a word must appear in at least 2 times and at most 70% of the comments to be included to fit the model

In [100]:
vectorizer = CountVectorizer(min_df=2, max_df = 0.7, stop_words = 'english', lowercase = True)
X = df['CONTENT']
Y = df['CLASS']

Seperate the train and test data. 20% of data for testing and 80% for training

In [101]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 11)

Use LR to fit the model

In [102]:
model = make_pipeline(vectorizer, StandardScaler(with_mean=False), LogisticRegression()) 
model.fit(X_train, Y_train)

Use the fitted model to predict the testing data and calculate the accuracy score

In [103]:
Y_pred = model.predict(X_test)
Y_pred = [1 if i >= 0.5 else 0 for i in Y_pred]
accuracy_on_training_data = accuracy_score(Y_test, Y_pred)

In [104]:
print(accuracy_on_training_data)

0.927007299270073


Calculate the f1 score

In [105]:
f1 = f1_score(Y_test, Y_pred)
print(f"F1-Score is {f1:.2f}")

F1-Score is 0.92


In [106]:
input_your_comment = ["check"]
prediction = model.predict(input_your_comment)
print(prediction)

[1]


Use the model to predict the test and generate the csv file

In [107]:
df2 = pd.read_csv("test.csv")
X1 = df2["CONTENT"].apply(clean_text)
Y1 = lasso.predict(X1)
Y1 = [1 if i >= 0.5 else 0 for i in Y1]

In [108]:
print(Y1)

[0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 

In [109]:
matrix_df = pd.DataFrame(Y1, columns=["CLASS"])
matrix_df["COMMENT_ID"]=df2["COMMENT_ID"]
matrix_df = matrix_df[["COMMENT_ID", "CLASS"]]

In [110]:
matrix_df.head()

Unnamed: 0,COMMENT_ID,CLASS
0,1370,0
1,1371,1
2,1372,1
3,1373,1
4,1374,0


In [111]:
matrix_df.to_csv("test_result11.csv", index=False)