In [9]:
import pandas as pd
import numpy as np
import scipy
from utils import load_data, preprocess_tweet_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


Preparing dataframe

In [10]:
bodies = load_data("bodies")
stances = load_data("stances")
df = pd.merge(stances, bodies, on=['Body ID'])
columns_titles = ["Headline","articleBody", "Stance"]
df=df.reindex(columns=columns_titles)

In [11]:
df.Stance.unique()

array(['unrelated', 'discuss', 'agree', 'disagree'], dtype=object)

Encoding Stance value to numeric

In [12]:
df = df.replace({'Stance' : { 'unrelated' : 0, 'disagree' : 1, 'discuss' : 2, 'agree' : 3 }})

In [13]:
df.Headline = df['Headline'].apply(preprocess_tweet_text)
df.articleBody = df['articleBody'].apply(preprocess_tweet_text)

In [14]:
df

Unnamed: 0,Headline,articleBody,Stance
0,Police find mass graves least 15 bodies near M...,Danny Boyle directing untitled film Seth Rogen...,0
1,Seth Rogen Play Apple ’ Steve Wozniak,Danny Boyle directing untitled film Seth Rogen...,2
2,Mexico police find mass grave near site 43 stu...,Danny Boyle directing untitled film Seth Rogen...,0
3,Mexico Says Missing Students Not Found In Firs...,Danny Boyle directing untitled film Seth Rogen...,0
4,New iOS 8 bug delete iCloud documents,Danny Boyle directing untitled film Seth Rogen...,0
...,...,...,...
49967,Amazon Is Opening BrickandMortar Store Manhattan,Amazon cyber store sells everything plans open...,3
49968,Elon University banned term ‘ freshman ’ despi...,ELON NC – A recent rumor claims Elon Universit...,3
49969,Fake BBC News website set carry Charlie Hebdo ...,A realisticlooking fake BBC News website set c...,3
49970,Apple forced nix key health features smartwatc...,The healthfocused smartwatch Apple initially e...,2


In [15]:
tfidf = TfidfVectorizer(max_features = 50000, 
                              ngram_range = (1,3),
                              stop_words = "english")
X_Headline = tfidf.fit_transform(df["Headline"].tolist())
X_Body = tfidf.fit_transform(df["articleBody"].tolist())
X = scipy.sparse.hstack((X_Headline, 
                         X_Body,
                         )).tocsr()
y = np.array(df.iloc[:, 2]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

Training Logistics Regression model

In [16]:
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Naive Bayes model

In [17]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)

Comparing accuracies:

In [18]:
print("Logistics Regression accuracy:", accuracy_score(y_test, y_predict_lr))
print("Naive Bayes:", accuracy_score(y_test, y_predict_nb))

Logistics Regression accuracy: 0.8255823261026175
Naive Bayes: 0.6842231649723846
