In [None]:
import os

import pandas as pd
import sklearn

from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [None]:
# Record the installed scikit-learn version; it is stored next to the saved
# pipeline later in the notebook.
sklearn_version = sklearn.__version__
sklearn_version

In [None]:
# Read the tab-separated dataset. The file carries no header row, so the two
# columns are named explicitly.
sentiment_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)
# Show ten randomly chosen rows as a quick look at the data.
sentiment_data.sample(n=10)

In [None]:
# (rows, columns) of the loaded frame.
sentiment_data.shape

In [None]:
# Features are the 'Text' column; targets are the 'Label' column.
x, y = sentiment_data['Text'], sentiment_data['Label']

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across
# Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_train.shape

In [None]:
# Size of the held-out test split.
x_test.shape

In [None]:
# TF-IDF vectorizer limited to 15 features, fitted on the training text only.
tfidf_vect = TfidfVectorizer(max_features=15)
x_trans = tfidf_vect.fit_transform(x_train)
# Peek at the first three transformed rows.
print(x_trans[:3])

In [None]:
# Linear SVM trained on the TF-IDF features. random_state pins the solver's
# pseudo-random behaviour so repeated runs yield the same model.
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3, random_state=42)
# Per scikit-learn's estimator API, fit() returns the estimator itself, so
# linear_svc_model and classifier refer to the same fitted object.
linear_svc_model = classifier.fit(x_trans, y_train)
linear_svc_model

In [None]:
# Vectorize the test text with the vocabulary learned from the training set
# (transform only; the vectorizer is not refitted here).
x_test_trans = tfidf_vect.transform(x_test)
print(x_test_trans[:3])

In [None]:
# Accuracy on the same data the model was fitted on.
print('Training Score :', linear_svc_model.score(x_trans, y_train))
# Predictions for the held-out test rows; scored in the next cell.
y_pred = linear_svc_model.predict(x_test_trans)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true test labels.
print('Testing Score: ', accuracy_score(y_test, y_pred))

In [None]:
# Predicted and actual labels side by side for the test rows.
pred_results = pd.DataFrame(dict(y_pred=y_pred, y_test=y_test))
# Display five randomly chosen rows.
pred_results.sample(n=5)

In [None]:
# Bundle the fitted vectorizer, the fitted model and some metadata into one
# dict so they can be saved and restored together.
text_clf_param = {
    'preprocessing': tfidf_vect,
    'model': linear_svc_model,
    # Spelled 'sklearn_version' to match the key used for the pipeline checkpoint.
    'sklearn_version': sklearn.__version__,
    'accuracy_score': accuracy_score(y_test, y_pred),
}

In [None]:
# Display the checkpoint dict that is about to be saved.
text_clf_param

In [None]:
import joblib

# Destination file for the vectorizer + model checkpoint dict.
path = 'models/text_clf_checkpoint.joblib'

In [None]:
# Make sure the target folder exists before writing; otherwise the dump fails
# with FileNotFoundError on a fresh checkout without a models/ directory.
os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
# Persist the checkpoint dict; the cell displays joblib.dump's return value.
joblib.dump(text_clf_param, path)

In [None]:
# Load the checkpoint back from disk. NOTE(review): joblib.load unpickles the
# file, so only load checkpoints that come from a trusted source.
# Despite its name, reloaded_vect is the whole saved dict, not just the vectorizer.
reloaded_vect = joblib.load(path)
# Pull the vectorizer out of the reloaded dict.
vect_preprocess = reloaded_vect['preprocessing']
vect_preprocess

In [None]:
# Pull the classifier out of the reloaded checkpoint dict.
vec_model = reloaded_vect['model']
vec_model

In [None]:
# Re-run the test set through the reloaded vectorizer and model.
# This overwrites the y_pred computed earlier in the notebook.
y_pred = vec_model.predict(vect_preprocess.transform(x_test))
y_pred

In [None]:
# Test accuracy of the reloaded vectorizer + model.
print('Reloaded vect test score:', accuracy_score(y_test, y_pred))

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain vectorizer and classifier so raw text can be passed straight to
# fit/predict. clone() yields unfitted copies with the same hyper-parameters,
# so fitting the pipeline does not refit (and silently mutate) the tfidf_vect
# and classifier objects that were fitted and checkpointed earlier.
clf_pipeline = Pipeline(steps=[
    ('tfidf_vect', clone(tfidf_vect)),
    ('classifier', clone(classifier)),
])
# Fit on the raw training text; the pipeline vectorizes internally.
pipeline_model = clf_pipeline.fit(x_train, y_train)
pipeline_model

In [None]:
# Predict directly from raw test text using the fitted pipeline.
# This overwrites the previous y_pred.
y_pred = pipeline_model.predict(x_test)
y_pred

In [None]:
# Test accuracy of the pipeline's predictions.
print('Pipeline Test Score :', accuracy_score(y_test, y_pred))

In [None]:
# Checkpoint dict for the pipeline: the fitted pipeline plus the scikit-learn
# version and the test accuracy it achieved.
pipe_clf_param = {
    'pipeline_clf': pipeline_model,
    'sklearn_version': sklearn_version,
    'accuracy': accuracy_score(y_test, y_pred),
}
pipe_clf_param

In [None]:
# Destination file for the pipeline checkpoint dict.
filename = 'models/pipe_clf_checkpoint.joblib'

In [None]:
# Make sure the target folder exists before writing; otherwise the dump fails
# with FileNotFoundError on a fresh checkout without a models/ directory.
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
# Persist the pipeline checkpoint; the cell displays joblib.dump's return value.
joblib.dump(pipe_clf_param, filename)

In [None]:
# Load the pipeline checkpoint back from disk and display it.
# NOTE(review): joblib.load unpickles the file; only load trusted checkpoints.
reloaded_param = joblib.load(filename)
reloaded_param