In [36]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
# Load the tab-separated dataset. The file is read without a header row,
# so the two column names are supplied explicitly.
sentiment_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)
# Show a few random rows as a sanity check.
sentiment_data.sample(5)

Unnamed: 0,Label,Text
6516,0,Ok brokeback mountain is such a horrible movie.
258,1,The Da Vinci Code is awesome!!
3719,1,dudeee i LOVED brokeback mountain!!!!
843,1,The Da Vinci Code was absolutely AWESOME!
3021,1,= O I loved brokeback mountain and it made me ...


In [8]:
# (rows, columns) of the loaded frame.
sentiment_data.shape

(6918, 2)

In [9]:
# Separate the 'Text' column (model input) from the 'Label' column (target).
x, y = sentiment_data['Text'], sentiment_data['Label']
x.shape, y.shape

((6918,), (6918,))

In [10]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the same
# partition (and therefore the same reported accuracies) is reproduced on
# every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Sizes of the training and test partitions.
x_train.shape, x_test.shape

((5534,), (1384,))

In [12]:
# TF-IDF text vectorizer configured with max_features=15. This single
# instance is used as the first step of each pipeline built in this notebook,
# so it is re-fitted every time one of those pipelines is fitted.
tfidf_vect = TfidfVectorizer(max_features=15)

In [13]:
# Logistic-regression classifier using the 'liblinear' solver.
logistic_clf = LogisticRegression(solver='liblinear')

In [14]:
# Chain TF-IDF feature extraction with the logistic-regression classifier so
# raw text can be passed straight to fit/predict.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', logistic_clf),
])

# Train both steps on the training partition and display the fitted pipeline.
pipeline_model = clf_pipeline.fit(x_train, y_train)
pipeline_model

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=15,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, interce

In [16]:
# Predict labels for the held-out test texts with the fitted pipeline.
y_pred = pipeline_model.predict(x_test)
y_pred

array([1, 1, 0, ..., 0, 0, 0])

In [18]:
# Score the logistic-regression predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8988439306358381

In [20]:
# Persist the fitted logistic-regression pipeline. The context manager
# guarantees the file handle is flushed and closed, even if dump() raises;
# the bare open(...) previously left that to garbage collection.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [21]:
# Decision-tree classifier with its depth capped at 10.
decision_tree_clf = DecisionTreeClassifier(max_depth=10)

In [22]:
# Same two-step layout as before, now with the decision tree as the
# classifier. Rebinding clf_pipeline / pipeline_model replaces the
# previously fitted pipeline in the namespace.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', decision_tree_clf),
])

pipeline_model = clf_pipeline.fit(x_train, y_train)

In [27]:
# Predict test labels with the decision-tree pipeline (overwrites y_pred).
y_pred = pipeline_model.predict(x_test)
y_pred

array([1, 1, 0, ..., 0, 0, 0])

In [38]:
# Score the decision-tree predictions against the true test labels.
tree_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
tree_accuracy

0.9024566473988439

In [40]:
# Persist the fitted decision-tree pipeline. The context manager guarantees
# the file handle is flushed and closed, even if dump() raises.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [41]:
# Linear support-vector classifier with C=1.0 and at most 100 solver iterations.
# NOTE(review): max_iter is set explicitly to 100; if fitting emits a
# ConvergenceWarning on a fresh run, this cap should be raised — confirm.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100)

In [43]:
# Same two-step layout again, this time ending in the linear SVC.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', linear_svc_clf),
])

# Train on the training partition and display the fitted pipeline.
pipeline_model = clf_pipeline.fit(x_train, y_train)
pipeline_model

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=15,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
     

In [45]:
# Predict test labels with the linear-SVC pipeline (overwrites y_pred).
y_pred = pipeline_model.predict(x_test)
y_pred

array([1, 1, 0, ..., 0, 0, 0])

In [46]:
# Score the linear-SVC predictions. The result is stored under its own name:
# assigning it to `accuracy_score` would rebind the imported sklearn function
# to a float, so re-running this cell (or any later accuracy_score(...) call)
# would raise "TypeError: 'float' object is not callable".
svc_accuracy = accuracy_score(y_test, y_pred)
svc_accuracy

0.898121387283237

In [48]:
# Persist the fitted linear-SVC pipeline. The context manager guarantees the
# file handle is flushed and closed, even if dump() raises.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)