In [1]:
import pandas as pd, numpy as np, re, time
from nltk.stem.porter import PorterStemmer

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
data = pd.read_csv('Dataset/Train.csv')

In [7]:
data

Unnamed: 0,ID,comment,date,down,parent_comment,score,top,topic,user,label
0,uid_590555,"Well, let's be honest here, they don't actuall...",2015-04,0,They should shut the fuck up and let the commu...,2,2,starcitizen,Combat_Wombatz,0
1,uid_671762,"Well, I didn't need evidence to believe in com...",2016-12,-1,You need evidence to kill people? I thought we...,6,-1,EnoughCommieSpam,starkadd,1
2,uid_519689,"Who does an ""official promo"" in 360p?",2013-11,0,2014 BMW S1000R: Official Promo,3,3,motorcycles,phybere,0
3,uid_788362,Grotto koth was the best,2015-09,0,Not really that memorable lol if you want memo...,2,2,hcfactions,m0xyMC,1
4,uid_299252,Neal's back baby,2015-11,0,James Neal hit on Zach Parise,-5,-5,hockey,Somuch101,1
...,...,...,...,...,...,...,...,...,...,...
14995,uid_845344,Well with a name like El Cubano I'm surprised ...,2015-01,0,There's two things you don't do in Florida. - ...,18,18,hockey,shutupisaac,0
14996,uid_757880,... This is a good point.,2014-04,0,Sounds like a pretty good overall summary of o...,6,6,hockey,em483,0
14997,uid_724706,Yep.,2015-09,0,"I know the type you speak of. The ""die cis scu...",2,2,AskReddit,YoImAli,0
14998,uid_1006984,That's what the government WANTS you to believe!,2016-01,0,That there's A hidden cure for cancer but phar...,1,1,AskReddit,OhHiGCHQ,1


In [8]:
print(data.isnull().any(axis = 0)) #no null data is present in the dataset

ID                False
comment           False
date              False
down              False
parent_comment    False
score             False
top               False
topic             False
user              False
label             False
dtype: bool


In [9]:
# Relacing special symbols and digits in headline column
# re stands for Regular Expression
data['comment'] = data['comment'].apply(lambda s : re.sub('[^a-zA-Z]', ' ', s))

In [10]:
data

Unnamed: 0,ID,comment,date,down,parent_comment,score,top,topic,user,label
0,uid_590555,Well let s be honest here they don t actuall...,2015-04,0,They should shut the fuck up and let the commu...,2,2,starcitizen,Combat_Wombatz,0
1,uid_671762,Well I didn t need evidence to believe in com...,2016-12,-1,You need evidence to kill people? I thought we...,6,-1,EnoughCommieSpam,starkadd,1
2,uid_519689,Who does an official promo in p,2013-11,0,2014 BMW S1000R: Official Promo,3,3,motorcycles,phybere,0
3,uid_788362,Grotto koth was the best,2015-09,0,Not really that memorable lol if you want memo...,2,2,hcfactions,m0xyMC,1
4,uid_299252,Neal s back baby,2015-11,0,James Neal hit on Zach Parise,-5,-5,hockey,Somuch101,1
...,...,...,...,...,...,...,...,...,...,...
14995,uid_845344,Well with a name like El Cubano I m surprised ...,2015-01,0,There's two things you don't do in Florida. - ...,18,18,hockey,shutupisaac,0
14996,uid_757880,This is a good point,2014-04,0,Sounds like a pretty good overall summary of o...,6,6,hockey,em483,0
14997,uid_724706,Yep,2015-09,0,"I know the type you speak of. The ""die cis scu...",2,2,AskReddit,YoImAli,0
14998,uid_1006984,That s what the government WANTS you to believe,2016-01,0,That there's A hidden cure for cancer but phar...,1,1,AskReddit,OhHiGCHQ,1


In [20]:
# getting features and labels
features = data['comment']
labels = data['label']

In [13]:
# Stemming our data
ps = PorterStemmer()
features = features.apply(lambda x: x.split())
features = features.apply(lambda x : ' ' .join([ps.stem(word) for word in x]))

In [14]:
# vectorizing the data with maximum of 5000 features
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 5000)
features = list(features)
features = tv.fit_transform(features).toarray()

In [15]:
# getting training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = .05, random_state = 0)

In [16]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(features_train, labels_train)
print(lr.score(features_train, labels_train))  
print(lr.score(features_test, labels_test))     



0.7547368421052632
0.6653333333333333


In [17]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)
rfc.fit(features_train, labels_train)
print(rfc.score(features_train, labels_train))  
print(rfc.score(features_test, labels_test))    

0.9715789473684211
0.644


In [24]:
#Predicting using the test dataset
data_set_for_submission = pd.read_csv('Dataset/Test.csv')

In [25]:
submission_test_features =  data_set_for_submission['comment']

In [27]:
features_test = list(submission_test_features)
features_test = tv.fit_transform(features_test).toarray()

In [32]:
final_prediction = rfc.predict(features_test) #numpy.ndarray for the prediction values on the test data

In [43]:
print(final_prediction)

df_submission = pd.DataFrame(data=final_prediction,columns=["label"])

[1 1 1 ... 0 0 1]


In [52]:
df_submission.head() # 

type(df_submission)

pandas.core.frame.DataFrame

In [53]:
# Format predictions in DataFrame: prediction_df

ID_test_dataset  = data_set_for_submission['ID']





In [55]:
#merge predicted output 
prediction_submission_df = pd.merge(ID_test_dataset, df_submission, left_index=True, right_index=True)

In [56]:
prediction_submission_df.head()

Unnamed: 0,ID,label
0,uid_764784,1
1,uid_67552,1
2,uid_240490,1
3,uid_56568,1
4,uid_875860,0


In [59]:
# Save prediction_df to csv
prediction_submission_df.to_csv('predictions_submission_final.csv')

In [60]:
prediction_submission_df.to_csv('predictions_submission_final_without_index.csv',index=False)