In [1]:
# import basic machine learning libraries 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sea 

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,KFold,GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
# import nltk libraies for text preprocessing 

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import re

In [6]:
stop_words=stopwords.words('english')
stemmer=PorterStemmer()
lemitizer=WordNetLemmatizer()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [21]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
tfidf=TfidfVectorizer()
bag=CountVectorizer()

In [10]:
df=pd.read_csv('train.csv')

In [11]:
df_final=df[['title','label']]

In [12]:
df_final.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [13]:
df_final=df_final.dropna()

In [14]:
import re

def clean_text(text):
 
    
    
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    text = text.strip().lower()
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

In [15]:
df_final['title']=df_final['title'].apply(lambda x:clean_text(x))

In [16]:
df_final.head()

Unnamed: 0,title,label
0,house dem aide we didn’t even see comey’s let...,1
1,flynn hillary clinton big woman on campus ...,0
2,why the truth might get you fired,1
3,15 civilians killed in single us airstrike hav...,1
4,iranian woman jailed for fictional unpublished...,1


In [17]:
def cleaning(text):
    var1=[]
    for i in word_tokenize(text):
        var1.append(i)
    var2=[]
    for i in var1:
        if i not in stop_words:
            var2.append(i)
    var3=[]
    for i in var2:
        var3.append(stemmer.stem(i))
    var4=[]
    for i in var3:
        var4.append(lemitizer.lemmatize(i))
        
    return var4

In [18]:
df_final['title']=df_final['title'].apply(lambda x:cleaning(x))

In [19]:
df_final.head(10)

Unnamed: 0,title,label
0,"[hous, dem, aid, ’, even, see, comey, ’, lette...",1
1,"[flynn, hillari, clinton, big, woman, campu, b...",0
2,"[truth, might, get, fire]",1
3,"[15, civilian, kill, singl, u, airstrik, ident...",1
4,"[iranian, woman, jail, fiction, unpublish, sto...",1
5,"[jacki, mason, hollywood, would, love, trump, ...",0
6,"[life, life, luxuri, elton, john, ’, 6, favori...",1
7,"[benoît, hamon, win, french, socialist, parti,...",0
8,"[excerpt, draft, script, donald, trump, ’, q, ...",0
9,"[back, channel, plan, ukrain, russia, courtesi...",0


In [20]:
df_final['title']=df_final['title'].apply(lambda x:' '.join(x))

In [22]:
df_final.head()

Unnamed: 0,title,label
0,hous dem aid ’ even see comey ’ letter jason c...,1
1,flynn hillari clinton big woman campu breitbart,0
2,truth might get fire,1
3,15 civilian kill singl u airstrik identifi,1
4,iranian woman jail fiction unpublish stori wom...,1


In [23]:
x=df_final['title']
y=df_final['label']

In [24]:
x_bag=bag.fit_transform(x)
x_tfidf=tfidf.fit_transform(x)

# For Bag Of Word

In [39]:
x_train,x_test,y_train,y_test=train_test_split(x_bag,y,random_state=0,test_size=0.25)

In [40]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((15181, 16458), (15181,), (5061, 16458), (5061,))

In [41]:
models =[('LR',LogisticRegression()),
        ('RF',RandomForestClassifier(n_estimators=5)),
        ('NV',BernoulliNB()),
        ('DT',DecisionTreeClassifier())]

In [42]:
results=[]
names=[]
for name,model in models:
    kfold=KFold(n_splits=10,shuffle=True,random_state=0)
    cv_results=cross_val_score(model,x_train,y_train,cv=kfold,scoring='accuracy')
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.935116 (0.005606)
RF: 0.925433 (0.005695)
NV: 0.923918 (0.006010)
DT: 0.920032 (0.006675)


In [43]:
model=LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression()

In [44]:
y_pred=model.predict(x_test)

In [45]:
accuracy_score(y_test,y_pred)

0.9338075479154317

# For tfidf

In [46]:
x_train,x_test,y_train,y_test=train_test_split(x_tfidf,y,random_state=0,test_size=0.25)

In [47]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((15181, 16458), (15181,), (5061, 16458), (5061,))

In [48]:
models =[('LR',LogisticRegression()),
        ('RF',RandomForestClassifier(n_estimators=5)),
        ('NV',BernoulliNB()),
        ('DT',DecisionTreeClassifier())]

In [49]:
results=[]
names=[]
for name,model in models:
    kfold=KFold(n_splits=10,shuffle=True,random_state=0)
    cv_results=cross_val_score(model,x_train,y_train,cv=kfold,scoring='accuracy')
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.922666 (0.004703)
RF: 0.917857 (0.007377)
NV: 0.923918 (0.006010)
DT: 0.913840 (0.005984)


In [50]:
model=BernoulliNB()

model.fit(x_train,y_train)

BernoulliNB()

In [51]:
y_pred=model.predict(x_test)

In [52]:
accuracy_score(y_test,y_pred)

0.9207666469077257