In [3]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
sklearn_version = sklearn.__version__
sklearn_version

'0.22.1'

In [5]:
sentiment_data = pd.read_csv('datasets/sentimental_analysis_data.csv', header=None, names=['Label', 'Text'], sep='\t')
sentiment_data.sample(10) 

Unnamed: 0,Label,Text
6238,0,Brokeback Mountain is fucking horrible..
4372,0,"by the way, the Da Vinci Code sucked, just let..."
1631,1,I like Mission Impossible movies because you n...
5107,0,"yeah-I hate Harry Potter, I'll gladly play Dun..."
6010,0,I SOOOO HATED BROKEBACK MOUNTAIN!!..
6670,0,", she helped me bobbypin my insanely cool hat ..."
610,1,the people who are worth it know how much i lo...
5106,0,Yeah yuck I hate Harry Potter.
3241,1,"Anyway, thats why I love "" Brokeback Mountain."
5243,0,"Not because I hate Harry Potter, but because I..."


In [6]:
sentiment_data.shape

(6918, 2)

In [7]:
x = sentiment_data['Text']
y = sentiment_data['Label']

In [8]:
x_train, x_test,y_train,y_test = train_test_split(x, y, test_size=0.2)
x_train.shape

(5534,)

In [9]:
x_test.shape

(1384,)

In [10]:
tfidf_vect = TfidfVectorizer(max_features=15)
x_trans = tfidf_vect.fit_transform(x_train)
print(x_trans[0:3])

  (0, 1)	0.5112110842642678
  (0, 7)	0.46067933742979467
  (0, 6)	0.5130486211805841
  (0, 9)	0.5130486211805841
  (1, 0)	0.45010655465401805
  (1, 11)	0.42418492958914555
  (1, 5)	0.42418492958914555
  (1, 8)	0.45975965849725864
  (1, 7)	0.47556223241009876
  (2, 3)	0.5772422702519212
  (2, 13)	0.5774042610833453
  (2, 4)	0.5774042610833453


In [11]:
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans,y_train)
linear_svc_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [12]:
x_test_trans = tfidf_vect.transform(x_test)
print(x_test_trans[0:3])

  (0, 11)	0.7071067811865475
  (0, 5)	0.7071067811865475
  (1, 14)	0.5015008538368719
  (1, 9)	0.500095047046361
  (1, 6)	0.500095047046361
  (1, 1)	0.49830390470102126
  (2, 12)	0.43690205273351207
  (2, 11)	0.4984282664840312
  (2, 7)	0.558797926496413
  (2, 5)	0.4984282664840312


In [13]:
y_pred = linear_svc_model.predict(x_test_trans)
print('Training Score :', linear_svc_model.score(x_trans, y_train))

Training Score : 0.8968196602818937


In [14]:
from sklearn.metrics import accuracy_score

print('Testing Score: ', accuracy_score(y_test, y_pred))

Testing Score:  0.8822254335260116


In [15]:
pred_results = pd.DataFrame({
    'y_pred': y_pred,
    'y_test': y_test
})
pred_results.sample(5)

Unnamed: 0,y_pred,y_test
6183,0,0
2774,0,1
1387,1,1
6302,0,0
1488,1,1


In [16]:
text_clf_param = {}
text_clf_param['preprocessing'] = tfidf_vect
text_clf_param['model'] = linear_svc_model
text_clf_param['sklearn_vesrion'] = sklearn.__version__
text_clf_param['accuracy_score'] = accuracy_score(y_test, y_pred)

In [17]:
text_clf_param

{'preprocessing': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=15,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None, strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'model': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
           verbose=0),
 'sklearn_vesrion': '0.22.1',
 'accuracy_score': 0.8822254335260116}

In [1]:
import joblib

path = 'models/text_clf_checkpoint.joblib'

In [18]:
joblib.dump(text_clf_param, path)

['models/text_clf_checkpoint.joblib']

In [25]:
reloaded_vect = joblib.load(path)
vect_preprocess = reloaded_vect['preprocessing']
vect_preprocess

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [23]:
vec_model = reloaded_vect['model']
vec_model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)

In [26]:
y_pred = vec_model.predict(vect_preprocess.transform(x_test))
y_pred

array([0, 1, 0, ..., 0, 1, 0])

In [27]:
print('Reloaded vect test score:', accuracy_score(y_test, y_pred))

Reloaded vect test score: 0.8822254335260116
