# Classic Machine Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)
from ast import literal_eval


# Raw competition files: labelled training tweets and unlabelled test tweets.
train_path = '../data/train_E6oV3lV.csv'
test_path = '../data/test_tweets_anuFYb8.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Pre-processed tweets. Missing cells are replaced with empty strings so the
# text columns handed to the vectorizers contain no NaN.
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)
all_data = df.where(df.notna(), '')
# 'hashtag' is stored in the CSV as the text of a Python literal; parse it back.
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

# Number of labelled rows, derived from the data instead of being hard-coded.
# The slices below take the first n_train rows of the feature matrix as the
# training part.
# NOTE(review): this assumes all_data lists the training tweets first,
# followed by the test tweets -- confirm against the script that writes
# pandas_data_frame.csv.
n_train = len(train)

# Both vectorizers share the same vocabulary limits: drop terms that occur in
# more than 90% of the tweets or in fewer than 2, keep at most 1000 features,
# and remove English stop words.
vectorizer_params = dict(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# bag-of-words feature matrix
bow_vectorizer = CountVectorizer(**vectorizer_params)
bow = bow_vectorizer.fit_transform(all_data['tidy_tweet'])

# TF-IDF feature matrix
tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)
tfidf = tfidf_vectorizer.fit_transform(all_data['tidy_tweet'])

train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# splitting data into training and validation set (70% / 30%, fixed seed).
# ytrain / yvalid keep the original row labels of `train`, which later cells
# use to select the matching TF-IDF rows.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, train['label'], random_state=42, test_size=0.3
)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Logistic Regression with BOW
lreg = LogisticRegression()
lreg.fit(xtrain_bow, ytrain)  # training the model

prediction = lreg.predict_proba(xvalid_bow)  # class probabilities on the validation set
# If the probability in the second column is greater than or equal to 0.3
# then 1, else 0.
# astype(int) is used because the np.int alias was removed in NumPy 1.24.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

print("Logistic Regression with BOW f1: {}".format(f1_score(yvalid, prediction_int)))

# Logistic Regression with TF-IDF
# The first len(train) rows of the feature matrix are taken as the training
# part, so the boundary follows the training file instead of a hard-coded count.
train_tfidf = tfidf[:len(train), :]
test_tfidf = tfidf[len(train):, :]

# Reuse the exact rows chosen by the BOW split. `train` was read with the
# default integer index, so the labels kept in ytrain / yvalid are also the
# row positions inside train_tfidf.
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]

# Refit the same estimator on the TF-IDF features.
lreg.fit(xtrain_tfidf, ytrain)

prediction = lreg.predict_proba(xvalid_tfidf)
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

print("Logistic Regression with TF-IDF f1: {}".format(f1_score(yvalid, prediction_int)))

Logistic Regression with BOW f1: 0.42742653606411396
Logistic Regression with TF-IDF f1: 0.4140550807217474


In [3]:
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Other classifiers evaluated on the TF-IDF features.
# Each model is fitted on the training split and scored with F1 on the
# validation split.

# --- KNeighbors (disabled in the original notebook, kept for reference) ---
# neigh = KNeighborsClassifier(n_neighbors=10)
# neigh = neigh.fit(xtrain_tfidf.toarray(), ytrain)
# y_pred = neigh.predict(xvalid_tfidf.toarray())
# print("KNeighbors with TF-IDF f1: {}".format(f1_score(yvalid, y_pred)))

# --- Naive Bayes Gaussian ---
# The sparse matrices are converted to dense arrays before being passed in.
gnb = GaussianNB().fit(xtrain_tfidf.toarray(), ytrain)
y_pred = gnb.predict(xvalid_tfidf.toarray())
print(
    "Naive Bayes Gaussian with TF-IDF f1: {}".format(
        f1_score(yvalid, y_pred)
    )
)

# --- Decision Tree Classifier ---
clf = tree.DecisionTreeClassifier().fit(xtrain_tfidf, ytrain)
y_pred = clf.predict(xvalid_tfidf)
print(
    "Decision Tree Classifier with TF-IDF f1: {}".format(
        f1_score(yvalid, y_pred)
    )
)

# --- SVM (linear) ---
svm = LinearSVC().fit(xtrain_tfidf, ytrain)
y_pred = svm.predict(xvalid_tfidf)
print(
    "SVM with TF-IDF f1: {}".format(
        f1_score(yvalid, y_pred)
    )
)

Naive Bayes Gaussian with TF-IDF f1: 0.1775480059084195
Decision Tree Classifier with TF-IDF f1: 0.471195184866724
SVM with TF-IDF f1: 0.3795620437956205


In [4]:
# BOW
# Same classifiers as the TF-IDF cell, fitted on the bag-of-words features.
# The printed labels say "BOW" so the scores are not mistaken for the TF-IDF
# results.

# KNeighbors (disabled in the original notebook, kept for reference)
# neigh = KNeighborsClassifier(n_neighbors=10)
# neigh = neigh.fit(xtrain_bow.toarray(), ytrain)
# y_pred = neigh.predict(xvalid_bow.toarray())
# print("KNeighbors with BOW f1: {}".format(f1_score(yvalid, y_pred)))

# Naive Bayes Gaussian
# The sparse matrices are converted to dense arrays before being passed in.
gnb = GaussianNB()
gnb = gnb.fit(xtrain_bow.toarray(), ytrain)
y_pred = gnb.predict(xvalid_bow.toarray())
print("Naive Bayes Gaussian with BOW f1: {}".format(f1_score(yvalid, y_pred)))


# Decision Tree Classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(xtrain_bow, ytrain)
y_pred = clf.predict(xvalid_bow)
print("Decision Tree Classifier with BOW f1: {}".format(f1_score(yvalid, y_pred)))


# SVM
svm = LinearSVC()
svm.fit(xtrain_bow, ytrain)

y_pred = svm.predict(xvalid_bow)
print("SVM with BOW f1: {}".format(f1_score(yvalid, y_pred)))

Naive Bayes Gaussian with TF-IDF f1: 0.17315646840678933
Decision Tree Classifier with TF-IDF f1: 0.4461190655614168
SVM with TF-IDF f1: 0.39390862944162436


In [5]:
from scipy.sparse import hstack

# Stack BOW and TF-IDF
# Column-wise concatenation: the TF-IDF columns come first, followed by the
# BOW columns, for the same rows of each split.
x_val = hstack([xvalid_tfidf, xvalid_bow])
x_train = hstack([xtrain_tfidf, xtrain_bow])

# Logistic Regression
lreg = LogisticRegression()
lreg.fit(x_train, ytrain)
prediction = lreg.predict_proba(x_val)
# 1 when the probability in the second column is >= 0.3, else 0.
# astype(int) is used because the np.int alias was removed in NumPy 1.24.
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

print("Logistic Regression with TF-IDF and BOW f1: {}".format(f1_score(yvalid, prediction_int)))

# SVM
svm = LinearSVC()
svm.fit(x_train, ytrain)

y_pred = svm.predict(x_val)
print("SVM with TF-IDF and BOW  f1: {}".format(f1_score(yvalid, y_pred)))


# Decision Tree Classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train, ytrain)
y_pred = clf.predict(x_val)
print("Decision Tree Classifier with TF-IDF and BOW f1: {}".format(f1_score(yvalid, y_pred)))

Logistic Regression with TF-IDF and BOW f1: 0.43948497854077256
SVM with TF-IDF and BOW  f1: 0.401174168297456
Decision Tree Classifier with TF-IDF and BOW f1: 0.4756944444444445
