In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
#load csv file and label column
import pandas as pd 
data = pd.read_csv('analysis.csv',encoding='windows-1252')
data = data[['sentiment', 'text']]
data.columns = ['label', 'text']
data['label'] = data['label'].str.strip().str.lower()
print(data.head()) 
#print(data['label'].value_counts())
len(data)

      label                                               text
0   neutral                I`d have responded, if I were going
1  negative      Sooo SAD I will miss you here in San Diego!!!
2  negative                          my boss is bullying me...
3  negative                     what interview! leave me alone
4  negative   Sons of ****, why couldn`t they put them on t...


27481

In [3]:
#clean text
import re 
import string 

alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x) 
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation),' ',x.lower()) 

data['text'] = data['text'].astype(str).map(alphanumeric).map(punc_lower)
print(data.head())

      label                                               text
0   neutral                i d have responded  if i were going
1  negative      sooo sad i will miss you here in san diego   
2  negative                          my boss is bullying me   
3  negative                     what interview  leave me alone
4  negative   sons of       why couldn t they put them on t...


In [4]:
#split data into features and labels
x = data.text
y = data.label

In [5]:
x.head()

0                  i d have responded  if i were going
1        sooo sad i will miss you here in san diego   
2                            my boss is bullying me   
3                       what interview  leave me alone
4     sons of       why couldn t they put them on t...
Name: text, dtype: object

In [6]:
y.head()

0     neutral
1    negative
2    negative
3    negative
4    negative
Name: label, dtype: object

In [7]:
#split into test and training data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state = 42)

In [8]:
x_train.head()

23738    had a nice visit last night from a boy with vo...
26930                  never mind it s closed  sorry miley
9119      i won t leave you alone until you accept my a...
25447         not gonna ask what a boom boom is   morning 
25158    btw that song is you ll always find ur way bac...
Name: text, dtype: object

In [9]:
x_train.shape

(19236,)

In [10]:
y_train.head()

23738    positive
26930    negative
9119     positive
25447     neutral
25158     neutral
Name: label, dtype: object

In [11]:
y_train.shape

(19236,)

In [12]:
x_test.shape

(8245,)

In [13]:
y_test.shape

(8245,)

In [14]:
#Numerically encode the input data[Code]
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

#(trained text_messages, terms)
print(x_train_cv.shape)

(19236, 19182)


In [15]:
import joblib
joblib.dump(cv, 'countvectorizer.joblib')

['countvectorizer.joblib']

In [16]:
type(x_train_cv)
import scipy.sparse
pd.DataFrame.sparse.from_spmatrix(x_test_cv)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19172,19173,19174,19175,19176,19177,19178,19179,19180,19181
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
help(cv.fit_transform)

Help on method fit_transform in module sklearn.feature_extraction.text:

fit_transform(raw_documents, y=None) method of sklearn.feature_extraction.text.CountVectorizer instance
    Learn the vocabulary dictionary and return document-term matrix.

    This is equivalent to fit followed by transform, but more efficiently
    implemented.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which generates either str, unicode or file objects.

    y : None
        This parameter is ignored.

    Returns
    -------
    X : array of shape (n_samples, n_features)
        Document-term matrix.



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Initialize the logistic regression model
lr = LogisticRegression(max_iter=1000, random_state=42)

lr_param_grid = {
    'C': np.logspace(-3, 3, 7),
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2'],
    'max_iter': [500, 1000, 2000]
}

lr_grid = GridSearchCV(
    estimator = lr,
    param_grid = lr_param_grid,
    cv = 5,
    scoring='f1_weighted',
    n_jobs = -1
)

print("\nRunning Grid Search for Logistic Regression...")
lr_grid.fit(x_train_cv, y_train)

print("✓ Grid Search for Logistic Regression complete!")
print("Best Logistic Parameters: ", lr_grid.best_params_)
print("Best Logistic CV Accuracy: ", lr_grid.best_score_)


Running Grid Search for Logistic Regression...
✓ Grid Search for Logistic Regression complete!
Best Logistic Parameters:  {'C': np.float64(1.0), 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
Best Logistic CV Accuracy:  0.6820674008604045


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Naive Bayes model
nb = MultinomialNB()

nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'fit_prior': [True, False]
}

nb_grid = GridSearchCV(
    estimator = nb,
    param_grid = nb_param_grid,
    cv = 5,
    scoring='f1_weighted',
    n_jobs = -1
)

print("\nRunning Grid Search for Naive Bayes...")
nb_grid.fit(x_train_cv, y_train)

print("✓ Grid Search for Naive Bayes complete!")
print("Best Naive Bayes Parameters: ", nb_grid.best_params_)
print("Best Naive Bayes CV Accuracy: ", nb_grid.best_score_)


Running Grid Search for Naive Bayes...
✓ Grid Search for Naive Bayes complete!
Best Naive Bayes Parameters:  {'alpha': 2.0, 'fit_prior': True}
Best Naive Bayes CV Accuracy:  0.64228864741956


In [20]:
from sklearn.svm import SVC

# Initialize the SVM model
svm = SVC()

svm_param_grid = {
    'C': np.logspace(-2, 2, 7),
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]
}

svm_grid = GridSearchCV(
    estimator = svm,
    param_grid = svm_param_grid,
    cv = 5,
    scoring='f1_weighted',
    n_jobs = -1
)

print("\nRunning Grid Search for SVM...")
svm_grid.fit(x_train_cv, y_train)

print("✓ Grid Search for SVM complete!")
print("Best SVM Parameters: ", svm_grid.best_params_)
print("Best SVM CV Accuracy: ", svm_grid.best_score_)


Running Grid Search for SVM...
✓ Grid Search for SVM complete!
Best SVM Parameters:  {'C': np.float64(0.21544346900318834), 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best SVM CV Accuracy:  0.6918612399642895
