In [1]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the tab-separated dataset. The file carries no header row, so the two
# columns are named explicitly: 'Label' first, 'Text' second.
# NOTE(review): sampled rows show a lone double quote inside Text; pandas'
# default quote handling may merge such lines — confirm the row count against
# the raw file (csv.QUOTE_NONE may be needed).
sentimental_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

# Show ten randomly chosen rows as a quick sanity check.
sentimental_data.sample(10)

Unnamed: 0,Label,Text
5183,0,"Not because I hate Harry Potter, but because I..."
2078,1,"Also: "" Sexy Harry Potter."
2387,1,I love Harry Potter..
6331,0,My dad's being stupid about brokeback mountain...
3921,1,"Anyway, thats why I love "" Brokeback Mountain."
3727,1,Brokeback Mountain was an AWESOME movie.
4090,0,Da Vinci Code sucked but the story and ideas w...
5399,0,"Is it just me, or does Harry Potter suck?..."
5407,0,This quiz sucks and Harry Potter sucks ok bye..
5220,0,"Always knows what I want, not guy crazy, hates..."


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
sentimental_data.shape

(6918, 2)

In [4]:
# Separate the frame into model input (the 'Text' column) and target
# (the 'Label' column).
X, Y = sentimental_data['Text'], sentimental_data['Label']

In [5]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the
# shuffle — and therefore every accuracy computed from this split —
# repeatable across kernel restarts; without it each run evaluates on a
# different test set and the scores are not comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [6]:
# TF-IDF text vectorizer whose vocabulary is capped at 15 features.
# NOTE(review): 15 is a very small cap for free text — confirm it is intended.
tfidf_vect = TfidfVectorizer(max_features=15)

In [7]:
# Logistic-regression classifier configured with the 'liblinear' solver.
# NOTE(review): no random_state is passed — confirm whether repeatable fits
# are required.
logistic_clf = LogisticRegression(solver='liblinear')

In [8]:
# Chain the TF-IDF vectorizer and the logistic-regression classifier so raw
# text can be passed straight to fit/predict.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', logistic_clf),
])

# Fit both stages on the training split; the value returned by fit() is kept
# as pipeline_model for prediction and persistence.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [9]:
# Predict labels for the held-out texts with the fitted logistic pipeline.
y_pred = pipeline_model.predict(x_test)

In [10]:
# Score the logistic pipeline's predictions against the held-out labels.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.8894508670520231

In [11]:
# Persist the fitted logistic-regression pipeline. The context manager closes
# the file handle (flushing the pickle to disk) even if dump() raises; a bare
# open() would leave the handle to be closed whenever it is garbage-collected.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [12]:
# Decision-tree classifier limited to a maximum depth of 10.
# NOTE(review): no random_state is passed — confirm whether repeatable fits
# are required.
decision_tree_clf = DecisionTreeClassifier(max_depth=10)

In [13]:
# Same two-stage layout as the logistic pipeline, with the decision tree as
# the classifier.
# NOTE(review): tfidf_vect is the same instance given to the logistic
# pipeline; presumably fit() re-fits it in place — confirm no earlier fitted
# pipeline still needs to be used from memory.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', decision_tree_clf),
])

# Rebinds pipeline_model to the decision-tree pipeline fitted on the
# training split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [14]:
# Predict labels for the held-out texts with the fitted decision-tree pipeline.
y_pred = pipeline_model.predict(x_test)

In [15]:
# Score the decision-tree pipeline's predictions against the held-out labels.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.8858381502890174

In [16]:
# Persist the fitted decision-tree pipeline. The context manager closes the
# file handle (flushing the pickle to disk) even if dump() raises; a bare
# open() would leave the handle to be closed whenever it is garbage-collected.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [17]:
# Linear support-vector classifier with C=1.0 and max_iter=100.
# NOTE(review): confirm the solver converges within 100 iterations (watch for
# a ConvergenceWarning when fitting); no random_state is passed either.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100)

In [18]:
# Same two-stage layout as the earlier pipelines, with the linear SVC as the
# classifier.
# NOTE(review): tfidf_vect is the same instance given to the earlier
# pipelines; presumably fit() re-fits it in place — confirm no earlier fitted
# pipeline still needs to be used from memory.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', linear_svc_clf),
])

# Rebinds pipeline_model to the linear-SVC pipeline fitted on the training
# split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [19]:
# Predict labels for the held-out texts with the fitted linear-SVC pipeline.
y_pred = pipeline_model.predict(x_test)

In [20]:
# Score the linear-SVC pipeline's predictions against the held-out labels.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook renders the value.
Accuracy_score

0.8894508670520231

In [21]:
# Persist the fitted linear-SVC pipeline. The context manager closes the file
# handle (flushing the pickle to disk) even if dump() raises; a bare open()
# would leave the handle to be closed whenever it is garbage-collected.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)