## IMPORTING REQUIRED LIBRARIES

In [152]:
import itertools
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

## DATA LOADING

In [252]:
# Import dataset
df=pd.read_csv('C:\\Users\\shubham verma\\Downloads\\train.csv')

# Get the shape
df.shape

(20800, 5)

In [253]:
df.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


## DATA CLEANSING

### REMOVING NULL

In [254]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [255]:
df = df[['text','label']]

In [256]:
df = df.dropna()

In [257]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [258]:
df.shape

(20761, 2)

In [259]:
df['text'] = df['text'].apply(lambda x: x.lower())

### REMOVE PUNCTUATIONS

In [260]:
import string
string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’”“—' # manually adding unwanted punctuations
def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_txt = ''.join(all_list)
    return clean_txt
df['text'] = df['text'].apply(punctuation_removal)

### REMOVE STOPWORDS

In [261]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
def stopwords_removal(text):
# NEW LIST TO REPLACE STOPWORDS MANUALLY HAVING ' ’ ' i.e(where don't is present as don’t)
    stop_words = ["aren't",'couldn',"couldn't","don't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"should've","shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]
    stop = stopwords.words('english')
    stop_words=[x.replace("'","’") for x in stop_words]
    [stop.append(x) for x in stop_words]
    clean_txt = ' '.join([word for word in text.split() if word not in (stop)])
    return clean_txt
df['text'] = df['text'].apply(stopwords_removal)

### LEMMATIZATION

In [32]:
import spacy

In [33]:
sp = spacy.load('en_core_web_sm')

In [262]:
# LEMMATIZATION OF EACH WORD IN NEWS ARTICLE.
df['text'] = df['text'].apply(lambda x: ' '.join([word.lemma_ for word in sp(x)]))

In [110]:
# df['text'][0]

## TRAIN TEST SPLIT

In [263]:
# splitting the data into train and test sets
x_train,x_test,y_train,y_test=train_test_split(df['text'], df.label, test_size=0.2, random_state=123)

## TF-IDF  

In [264]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

In [265]:
# Fit & transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

## LOGISTIC REGRESSION

In [266]:
from sklearn.linear_model import LogisticRegression

In [267]:
clf = LogisticRegression(random_state=42,C=1)
clf.fit(tfidf_train,y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [268]:
# Predict and calculate accuracy
clf_pred=clf.predict(tfidf_test)
score=accuracy_score(y_test,clf_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.1%


## PASSIVE AGGRESSIVE CLASIFIER

In [269]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [302]:
# Initialize the PassiveAggressiveClassifier and fit training sets
pa_classifier=PassiveAggressiveClassifier(max_iter=1000,early_stopping=True,random_state=42)
pa_classifier.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=True, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=42, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [303]:
# Predict and calculate accuracy
y_pred=pa_classifier.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 95.5%


In [272]:
# Build confusion matrix
confusion_matrix(y_test,y_pred, labels=[0,1])

array([[1996,   92],
       [  95, 1970]], dtype=int64)

In [273]:
# OVERALL SCORES

print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))

0.9549705442764145
0.9549751579560641
0.9549669273514978


In [274]:
# CLASS LEVEL SCORES FOR LABEL 0 AND LABEL 1

from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(y_test, y_pred)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.95456719 0.95538312]
recall: [0.9559387  0.95399516]
fscore: [0.95525245 0.95468864]
support: [2088 2065]


## RANDOM FOREST

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [142]:
# Create and fit model 
rf_classifier = RandomForestClassifier(n_estimators = 100, random_state = 11) 
rf_classifier.fit(tfidf_train, y_train)  

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=11, verbose=0,
                       warm_start=False)

In [143]:
rf_preds = rf_classifier.predict(tfidf_test)
score=accuracy_score(y_test,rf_preds)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 89.36%


## DECISION TREE

In [144]:
from sklearn.tree import DecisionTreeClassifier

In [145]:
# Create and fit model 
dt_classifier = DecisionTreeClassifier(criterion= 'entropy',max_depth = 20, splitter='best',random_state=42) 
dt_classifier.fit(tfidf_train, y_train)  

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=20, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [146]:
dt_preds = dt_classifier.predict(tfidf_test)
score=accuracy_score(y_test,dt_preds)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 89.01%


## TEST DATA

FOLLOWING THE SAME PROCESS AS APPLIED ON TRAIN DATA.

In [174]:
# Import dataset
testdf=pd.read_csv('C:\\Users\\dharmesh pathak\\Downloads\\test.csv')
# Get the shape
testdf.shape

(5200, 4)

In [175]:
testdf

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [177]:
testdf = testdf[['id','text']]

In [178]:
# TOTAL NULL PRESENT IN DATA.
testdf.isnull().sum()

id      0
text    7
dtype: int64

In [179]:
# REMOVING ROWS WHERE TEXT IS NULL.
testdf = testdf.dropna()

In [187]:
# PUNCTUATION REMOVAL
testdf['text'] = testdf['text'].apply(punctuation_removal)

In [221]:
# STOPWORDS REMOVAL
testdf['text'] = testdf['text'].apply(stopwords_removal)

In [224]:
# LEMMATIZATION OF EACH WORD IN NEWS ARTICLE.
testdf['text'] = testdf['text'].apply(lambda x: ' '.join([word.lemma_ for word in sp(x)]))

In [225]:
testdf['text']

0       PALO ALTO Calif after year scorn political pro...
1       russian warship ready strike terrorist near Al...
2       video nodapl native american leader Vow Stay a...
3       if first do not succeed try different sport Ti...
4       42 min ago 1 view 0 comment 0 like for first t...
                              ...                        
5195    of dysfunction plague world megacitie none may...
5196    WASHINGTON Gov John Kasich Ohio Tuesday sign l...
5197    good morning want get California Today email H...
5198    « Previous Next » 300 US Marines to be deploy ...
5199    perhaps -PRON- have see new tv series whose pi...
Name: text, Length: 5193, dtype: object

In [228]:
# VECTORIZING THE TEST DATA
test_tfidf = tfidf_vectorizer.transform(testdf['text'])

In [236]:
# CLASSIFICATION USING TRAINED PASSIVE AGGRESSIVE CLASSIFIER.
test_pred=pa_classifier.predict(test_tfidf)

In [237]:
len(test_pred)

5193

In [238]:
# CREATING LABEL COLUMN HAVING PREDICTED VALUES.
testdf['label'] = test_pred

In [242]:
submit_df = testdf.drop('text',axis=1)

In [251]:
submit_df

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5188,25995,1
5189,25996,0
5190,25997,0
5191,25998,1


In [249]:
# WRITE submit.csv to disk
submit_df.to_csv('C:\\Users\\shubham verma\\Downloads\\submit.csv',index=False)