In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV


In [2]:
df = pd.read_csv('../../data/100_sentiment_analysis_sentences.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    100 non-null    object
 1   label   100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [4]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

# We transform each complaint into a vector
features = tfidf.fit_transform(df.text)

labels = df.label

print("Each of the %d label is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))

Each of the 100 label is represented by 61 features (TF-IDF score of unigrams and bigrams)


In [5]:
df_tfidf = pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names(), index=df.index)
df_tfidf['label'] = df['label']
df_tfidf.head()
df_tfidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 62 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2030               100 non-null    float64
 1   2035               100 non-null    float64
 2   access             100 non-null    float64
 3   access reuters     100 non-null    float64
 4   agency             100 non-null    float64
 5   battery            100 non-null    float64
 6   black              100 non-null    float64
 7   brand              100 non-null    float64
 8   california         100 non-null    float64
 9   car                100 non-null    float64
 10  carb               100 non-null    float64
 11  com                100 non-null    float64
 12  com register       100 non-null    float64
 13  company            100 non-null    float64
 14  department         100 non-null    float64
 15  did                100 non-null    float64
 16  electric           100 non-

In [6]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features.toarray(), labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [7]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.49,0.138744
LogisticRegression,0.54,0.194936
MultinomialNB,0.49,0.174642
RandomForestClassifier,0.51,0.204328


In [8]:
X_train, X_test, y_train, y_test,indices_train,indices_test = train_test_split(features, 
                                                               labels, 
                                                               df.index, test_size=0.25, 
                                                               random_state=1)
model = LogisticRegression()
'''
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))'''



'\nsolvers = [\'newton-cg\', \'lbfgs\', \'liblinear\']\npenalty = [\'l2\']\nc_values = [100, 10, 1.0, 0.1, 0.01]\n# define grid search\ngrid = dict(solver=solvers,penalty=penalty,C=c_values)\ncv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\ngrid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring=\'accuracy\',error_score=0)\ngrid_result = grid_search.fit(X_train, y_train)\n# summarize results\nprint("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))'

In [9]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, 
                                    target_names= df['label'].unique()))

				CLASSIFICATIION METRICS

              precision    recall  f1-score   support

    POSITIVE       0.57      0.40      0.47        10
    NEGATIVE       0.33      0.40      0.36         5
     NEUTRAL       0.58      0.70      0.64        10

    accuracy                           0.52        25
   macro avg       0.50      0.50      0.49        25
weighted avg       0.53      0.52      0.52        25



In [11]:
# save the model to disk
pickle.dump(model, open( '../models/tfidf_custom_ml.pkl', 'wb'))

In [12]:
pickle.dump(tfidf, open("../models/tfidf_vectorizer_custom_ml.pkl", "wb"))

In [13]:
saved_model = pickle.load(open('../models/tfidf_custom_ml.pkl', 'rb'))
saved_tfidf =  pickle.load(open('../models/tfidf_vectorizer_custom_ml.pkl', 'rb'))

test_input = pd.DataFrame(tfidf.transform(["test_input"]).toarray(), columns=tfidf.get_feature_names())
new_prediction = saved_model.predict(test_input)
new_prediction

array(['NEUTRAL'], dtype=object)