## tf-idf sparse matrix

In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df=pd.read_csv("twitter30k_cleaned.csv")
df.head()

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0


In [4]:
df.shape

(30000, 2)

In [5]:
df["sentiment"].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

In [6]:
X=df["twitts"]
y=df["sentiment"]

In [7]:
tfidf=TfidfVectorizer()
X=tfidf.fit_transform(X)

In [8]:
X
#sparse matrix

<30000x42126 sparse matrix of type '<class 'numpy.float64'>'
	with 350520 stored elements in Compressed Sparse Row format>

In [9]:
tfidf.vocabulary_
#frequency in vocabulary

{'robbiebronniman': 30912,
 'sounds': 34066,
 'like': 21327,
 'great': 15018,
 'night': 25834,
 'damn': 9231,
 'the': 36430,
 'person': 27909,
 'who': 40389,
 'stolde': 34807,
 'my': 25090,
 'wallet': 39753,
 'may': 23147,
 'karma': 19616,
 'come': 7980,
 'back': 3584,
 'and': 2366,
 'bite': 4688,
 'you': 41635,
 'in': 17392,
 'ass': 3123,
 'greetings': 15068,
 'from': 13806,
 'piano': 28121,
 'bench': 4328,
 'photo': 28078,
 'drewryanscott': 10916,
 'love': 21980,
 'it': 17964,
 'haha': 15398,
 'forget': 13485,
 'hugyou': 16811,
 'should': 32921,
 'give': 14541,
 'me': 23268,
 'kissno': 20189,
 'lie': 21260,
 'please': 28418,
 'would': 40931,
 'be': 4019,
 'awesome': 3424,
 'if': 17159,
 'did': 10224,
 'kissthestars': 20190,
 'pretty': 28945,
 'pakidownload': 27326,
 'ito': 18013,
 'then': 36601,
 'reupload': 30620,
 'someother': 33896,
 'site': 33261,
 'mediafire': 23312,
 'hindi': 16299,
 'mgwork': 23651,
 'ang': 2439,
 'mu': 24903,
 'skin': 33323,
 'really': 29985,
 'upset': 38998,

In [10]:
len(tfidf.vocabulary_)

42126

In [11]:
X.shape, y.shape

((30000, 42126), (30000,))

In [12]:
type(X), type(y)

(scipy.sparse._csr.csr_matrix, pandas.core.series.Series)

In [13]:
sys.getsizeof(X)
#Return the size of an object in bytes

48

In [14]:
X.data

array([0.32137705, 0.33872744, 0.27319627, ..., 0.10368016, 0.10257841,
       0.08647948])

In [15]:
X.data.nbytes
#how many bytes they are 

2804160

In [16]:
d=(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
d

4326244

In [17]:
f"{d/(2**20)} MB"

'4.125827789306641 MB'

In [18]:
f"Total element size: {X.shape[0]*X.shape[1]}"

'Total element size: 1263780000'

In [19]:
(X.shape[0]*X.shape[1])*8/(2**30)
# it would be 9 GB if it was a dense matrix

9.415894746780396

#### Non-negative matrix factorization (NMF)

In [20]:
import random
from sklearn.decomposition import NMF
seed=42
random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

In [21]:
%%time
nmf=NMF(n_components=50, random_state=seed)
X_train_nmf=nmf.fit_transform(X_train)
#if you choose large number of components rather than 50, time will be much longer

CPU times: total: 51.7 s
Wall time: 50.4 s


In [22]:
X_train.shape, X_train_nmf.shape
#dimensionality reduction has been realized by NMF

((24000, 42126), (24000, 50))

In [23]:
def svm_model(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train,y_train)
    
    y_pred=clf.predict(X_test)
    
    print("Confusion matrix")
    cm=confusion_matrix(y_test,y_pred)
    print(cm)
    print("\n")
    print("Classification Report")
    cr=classification_report(y_test,y_pred)
    print(cr)

In [24]:
%%time
#reduced dimension data
clf=LinearSVC()

X_test_nmf=nmf.transform(X_test)

svm_model(clf,X_train_nmf,X_test_nmf, y_train, y_test)
# NMF can reduce the dimension, thus reduce the memory, you can save some space
# NMF is not a good choice for this problem

Confusion matrix
[[1859 1141]
 [ 979 2021]]


Classification Report
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      3000
           1       0.64      0.67      0.66      3000

    accuracy                           0.65      6000
   macro avg       0.65      0.65      0.65      6000
weighted avg       0.65      0.65      0.65      6000

CPU times: total: 359 ms
Wall time: 331 ms


In [25]:
%%time
#Original dimension data
svm_model(clf,X_train,X_test, y_train, y_test)

Confusion matrix
[[2257  743]
 [ 672 2328]]


Classification Report
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      3000
           1       0.76      0.78      0.77      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000

CPU times: total: 203 ms
Wall time: 202 ms


#### TSVD (Truncated singular value decomposition)

In [26]:
from sklearn.decomposition import TruncatedSVD as TSVD

In [27]:
%%time
# A lot faster than NMF
tsvd=TSVD(n_components=500,random_state=seed)
X_train_tsvd=tsvd.fit_transform(X_train)

CPU times: total: 49.4 s
Wall time: 20.1 s


In [28]:
sum(tsvd.explained_variance_)

0.3823003337385808

In [29]:
%%time
clf= LinearSVC()
X_test_tsvd=tsvd.transform(X_test)
svm_model(clf,X_train_tsvd,X_test_tsvd, y_train, y_test)
# with only 500 components, we could get the same accuracy as the original dimension data did

Confusion matrix
[[2153  847]
 [ 639 2361]]


Classification Report
              precision    recall  f1-score   support

           0       0.77      0.72      0.74      3000
           1       0.74      0.79      0.76      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: total: 1.88 s
Wall time: 1.88 s


In [30]:
X_train.shape, X_train_nmf.shape, X_train_tsvd.shape

((24000, 42126), (24000, 50), (24000, 500))

### Hyperparameters Tuning
At some point, you need to improve your model outputs. In order to do that, you need to specify manually parameters of model you have built to improve the accuracy, recall, and precision outputs as well as test results.

Example: learning rate in neural network or n_estimators in random forest classifiers.

They are not all the answers you are looking for your model.

There are 2 well-known tuning methods for hyperparameters
- Grid search
- Random search

### Logistic regression using GridSearch

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [32]:
train=X_train_tsvd.copy()
test=X_test_tsvd.copy()
# we are using the reduced dimensional data by tsvd
# we need to copy it in order to avoid mutable property

In [33]:
model=LogisticRegression(solver='liblinear',n_jobs=-1)
# using the'liblinear' solver since it supports both L1 and L2 regularization.

In [34]:
penalty=['l1','l2']
C=np.logspace(0,4,10)
max_iter=[100,500]

In [35]:
hyperparameters=dict(penalty=penalty,C=C,max_iter=max_iter)
print(hyperparameters)
# we need to define it by dictionary with the exact names used in the model

{'penalty': ['l1', 'l2'], 'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]), 'max_iter': [100, 500]}


In [36]:
clf=GridSearchCV(model,hyperparameters,cv=5, n_jobs=-1) # -1 means using all processors
print(clf)

GridSearchCV(cv=5, estimator=LogisticRegression(n_jobs=-1, solver='liblinear'),
             n_jobs=-1,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'max_iter': [100, 500], 'penalty': ['l1', 'l2']})


In [37]:
best=clf.fit(train, y_train)



In [41]:
# return best model with parameters
print(best.best_estimator_)

LogisticRegression(C=2.7825594022071245, n_jobs=-1, solver='liblinear')


In [42]:
# return best model with parameters
print(best.best_params_)

{'C': 2.7825594022071245, 'max_iter': 100, 'penalty': 'l2'}


In [43]:
#Accuracy with the selected parameters
best.best_score_

0.7432916666666667

In [44]:
y_pred=clf.predict(test)

In [46]:
print("Classification report\n")
print(classification_report(y_test,y_pred))

Classification report

              precision    recall  f1-score   support

           0       0.77      0.72      0.74      3000
           1       0.74      0.78      0.76      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



### SVM using GridSearch
SVM hyperparameters
- C
- gamma
- kernel
- degree

In [47]:
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")


from sklearn import datasets
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
iris=datasets.load_iris()
X=iris.data
y=iris.target
# using iris dataset, because previous dataset is huge and we don't want gridseach to take too long
# eventually, this is an example on how to do fine-tune and gridsearch

In [49]:
seed

42

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed, stratify = y)

In [51]:
# Setting the parameters by cross-validation
hyperparameters = [
    {'kernel': ['rbf'],
     'gamma': [1e-1, 1e-2],
     'C': [1, 10],
     'degree': [2, 3]},
                   
    {'kernel': ['linear'],
     'C': [1, 10]}]

In [53]:
scores = ['precision', 'recall']

def tuning(model, hyperparameters, scores):
    for score in scores:
        print("Tuning hyperparameters for %s" %score)
        print()
        
        clf=GridSearchCV(model,hyperparameters,scoring="%s_macro" %score, cv=5, n_jobs=-1)
        clf.fit(X_train,y_train)
        
        print("Best parameters found:")
        print()
        print(clf.best_params_)
        
        print("Grid scores in process")
        print()
        means=clf.cv_results_["mean_test_score"]
        
        for mean, params in zip(means, clf.cv_results_["params"]):
            print("%0.3f for %r" % (mean,params))
        
        print()
        print("Classification report")
        y_pred=clf.predict(X_test)
        print(classification_report(y_test, y_pred))
        print()

In [54]:
#running function
tuning(SVC(), hyperparameters, scores)

Tuning hyperparameters for precision

Best parameters found:

{'C': 1, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Grid scores in process

0.985 for {'C': 1, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
0.942 for {'C': 1, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
0.985 for {'C': 1, 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf'}
0.942 for {'C': 1, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
0.978 for {'C': 10, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
0.978 for {'C': 10, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
0.978 for {'C': 10, 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf'}
0.978 for {'C': 10, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
0.985 for {'C': 1, 'kernel': 'linear'}
0.969 for {'C': 10, 'kernel': 'linear'}

Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      0.90      0.90        10
           2       0.90      0.90      0.90        10

    accuracy                 

### Random forest GridSearch

Some of hyperparameters of Random forest are:
- **n_estimators**
- **max_depth** 
- **min_samples_split** 
- **min_samples_leaf** 
- **max_features** 

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [56]:
scores = ['precision', 'recall']

hyperparameters = {
    'bootstrap': [True],
    'max_depth': [10, 100],
    'max_features': [2, 3, X.shape[1]],
    'min_samples_leaf': [2, 5],
    'n_estimators': [10, 100, 200]
}

In [59]:
X_train.shape, y_train.shape

((120, 4), (120,))

In [60]:
# Running tuning function
rfc=RandomForestClassifier(n_jobs=-1)
tuning(rfc, hyperparameters, scores)

Tuning hyperparameters for precision

Best parameters found:

{'bootstrap': True, 'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 2, 'n_estimators': 200}
Grid scores in process

0.963 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 2, 'n_estimators': 10}
0.963 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 2, 'n_estimators': 100}
0.957 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 2, 'n_estimators': 200}
0.961 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 5, 'n_estimators': 10}
0.957 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 5, 'n_estimators': 100}
0.963 for {'bootstrap': True, 'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 5, 'n_estimators': 200}
0.957 for {'bootstrap': True, 'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 2, 'n_estimators': 10}
0.957 for {'bootstrap': True, 'max_depth': 10, 

### Random Search for hyperparameters

In [62]:
from scipy.stats import uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV

In [66]:
C=uniform(loc=0,scale=4)
gamma=loguniform(1e-5,1e-1)
hyperparameters= dict(C=C,gamma=gamma)

In [68]:
hyperparameters
# randomizedsearch will pick randomly from this distribution and try to fit the model

{'C': <scipy.stats._distn_infrastructure.rv_frozen at 0x21c4a325f70>,
 'gamma': <scipy.stats._distn_infrastructure.rv_frozen at 0x21c4953e160>}

In [None]:
clf=RandomizedSearchCV(SVC(),hyperparameters,cv=5, n_jobs=-1)

In [70]:
print(clf.fit(X_train,y_train))

GridSearchCV(cv=5, estimator=LogisticRegression(n_jobs=-1, solver='liblinear'),
             n_jobs=-1,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'max_iter': [100, 500], 'penalty': ['l1', 'l2']})


In [71]:
# best parameters from randomizedsearch
clf.best_params_

{'C': 21.544346900318832, 'max_iter': 100, 'penalty': 'l2'}

In [72]:
clf.best_score_

0.9666666666666668

In [73]:
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))
#

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



### Best model selection from many Machine Learning Models

In [76]:
from sklearn.pipeline import Pipeline
# we will perform it using pipeline

In [78]:
pipe = Pipeline([("classifier", RandomForestClassifier())])

search_space = [{"classifier": [LogisticRegression(solver='liblinear')],
                  "classifier__penalty": ['l1', 'l2'],
                  "classifier__C": np.logspace(0, 4, 10)},
                
                {"classifier": [RandomForestClassifier(n_jobs=-1)],
                  "classifier__n_estimators": [10, 100],
                  "classifier__max_features": [1, 3]},
                
                  {"classifier": [SVC()],
                  "classifier__C": [1, 5],
                  "classifier__gamma": [1e-1, 1e-2]}]

# search space is a list of dictionaries, which we want to use different models
# you can set hyperparameters of classifier by putting two underscore after classifier

In [79]:
clf=GridSearchCV(pipe,search_space,cv=5, n_jobs=-1)

In [81]:
print(clf.fit(X_train,y_train))

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [LogisticRegression(solver='liblinear')],
                          'classifier__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                          'classifier__penalty': ['l1', 'l2']},
                         {'classifier': [RandomForestClassifier(n_jobs=-1)],
                          'classifier__max_features': [1, 3],
                          'classifier__n_estimators': [10, 100]},
                         {'classifier': [SVC(C=1, gamma=0.1)],
                          'classifier__C': [1, 5],
                          'classifier__gamma': [0.1, 0.01]}])


In [83]:
print( clf.best_estimator_ )

Pipeline(steps=[('classifier', SVC(C=1, gamma=0.1))])


In [85]:
print( clf.best_params_ )

{'classifier': SVC(C=1, gamma=0.1), 'classifier__C': 1, 'classifier__gamma': 0.1}


In [None]:
# Using pipeline, You can check performance of every model while doing GridSearch