In [2]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the tab-separated data file. header=None tells pandas the file has no
# header row, so the two columns are named explicitly.
sentimental_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

# Show ten randomly chosen rows as a quick sanity check.
sentimental_data.sample(n=10)

Unnamed: 0,Label,Text
5045,0,I hate Harry Potter....
1650,1,i love being a sentry for mission impossible a...
3390,1,I either LOVE Brokeback Mountain or think it's...
318,1,The Da Vinci Code is awesome!!
6160,0,", she helped me bobbypin my insanely cool hat ..."
2384,1,Because I would like to make friends who like ...
5515,0,These Harry Potter movies really suck.
2530,1,I am going to start reading the Harry Potter s...
3321,1,"Anyway, thats why I love "" Brokeback Mountain."
5413,0,"Not because I hate Harry Potter, but because I..."


In [4]:
# Dimensions of the loaded frame as (rows, columns).
sentimental_data.shape

(6918, 2)

In [5]:
# Model input is the 'Text' column; the prediction target is the 'Label' column.
X, Y = sentimental_data['Text'], sentimental_data['Label']

In [6]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split -- and therefore every accuracy figure and saved model below --
# is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [7]:
# TF-IDF text featurizer; max_features=15 caps the vocabulary at 15 terms.
# This single instance is passed to every Pipeline built below, so each
# pipeline's fit() refits this same object.
# NOTE(review): 15 is a very small vocabulary -- confirm this is intentional.
tfidf_vect = TfidfVectorizer(max_features=15)

#### Logistic Regression

In [8]:
# Logistic-regression classifier using the liblinear solver; every other
# hyperparameter is left at its scikit-learn default.
logistic_clf = LogisticRegression(solver='liblinear')

In [9]:
# Two-stage pipeline: raw text -> TF-IDF features -> logistic regression.
logistic_steps = [
    ('tfidf_vect', tfidf_vect),
    ('classifier', logistic_clf),
]
clf_pipeline = Pipeline(steps=logistic_steps)

# Fit both stages on the training split; fit() hands back the fitted pipeline.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [10]:
# Predict labels for the held-out test texts with the fitted pipeline.
y_pred = pipeline_model.predict(x_test)

In [11]:
# Fraction of test rows whose predicted label matches the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Display the score as the cell output.
Accuracy_score

0.8966763005780347

In [12]:
# Persist the fitted pipeline to disk. Use a context manager so the file
# handle is flushed and closed as soon as the dump finishes.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

#### Decision Tree Classifier

In [13]:
# Decision tree limited to depth 10. scikit-learn randomly permutes candidate
# features at every split, so random_state is fixed to make repeated fits
# produce the same tree.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

In [14]:
# Two-stage pipeline: raw text -> TF-IDF features -> decision tree.
tree_steps = [
    ('tfidf_vect', tfidf_vect),
    ('classifier', decision_tree_clf),
]
clf_pipeline = Pipeline(steps=tree_steps)

# Fit both stages on the training split; fit() hands back the fitted pipeline.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [15]:
# Predict labels for the held-out test texts with the fitted pipeline.
y_pred = pipeline_model.predict(x_test)

In [16]:
# Fraction of test rows whose predicted label matches the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Display the score as the cell output.
Accuracy_score

0.8937861271676301

In [17]:
# Persist the fitted pipeline to disk. Use a context manager so the file
# handle is flushed and closed as soon as the dump finishes.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

#### Linear Support Vector Classifier (LinearSVC)

In [18]:
# Linear support-vector classifier. random_state fixes the data shuffling the
# solver may perform, so refitting gives the same model.
# NOTE(review): max_iter=100 is below the library default of 1000; watch for a
# ConvergenceWarning when fitting and raise it if one appears.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100, random_state=42)

In [19]:
# Two-stage pipeline: raw text -> TF-IDF features -> linear SVC.
svc_steps = [
    ('tfidf_vect', tfidf_vect),
    ('classifier', linear_svc_clf),
]
clf_pipeline = Pipeline(steps=svc_steps)

# Fit both stages on the training split; fit() hands back the fitted pipeline.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [20]:
# Predict labels for the held-out test texts with the fitted pipeline.
y_pred = pipeline_model.predict(x_test)

In [21]:
# Fraction of test rows whose predicted label matches the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Display the score as the cell output.
Accuracy_score

0.8973988439306358

In [22]:
# Persist the fitted pipeline to disk. Use a context manager so the file
# handle is flushed and closed as soon as the dump finishes.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)