In [None]:
import pandas as pd
import numpy as np

# Load the review table from disk and preview its first four rows.
reviews_sst = pd.read_csv('preprocessed_reviews_SST.csv')
reviews_sst.head(4)

In [None]:
# Build one text field per review: the summary, a space, then the review text.
# `Summary` is cast to str first (presumably to cope with missing or non-string
# summaries -- confirm against the CSV).
reviews_sst['smry_txt'] = reviews_sst['Summary'].astype(str) + ' ' + reviews_sst['Text']

# This cell used to end with `del reviews`, but no `reviews` variable is created
# anywhere in the notebook, so it raised NameError on a fresh kernel.
# NOTE(review): the following cells read a `dataset` frame with a `smry_txt`
# column, which is only ever created on `reviews_sst`; `dataset` is therefore
# bound to this frame here. Confirm that the CSV also provides the lowercase
# `score` column those cells expect.
dataset = reviews_sst

In [None]:
# Keep only clearly polarised reviews: scores 4-5 are treated as positive and
# 1-2 as negative, so rows with the neutral score 3 are dropped.
# `.copy()` makes the result an independent frame. Without it, the later
# `dataset['score'] = ...` assignment writes into a filtered slice and pandas
# emits SettingWithCopyWarning.
dataset = dataset[dataset.score != 3].copy()
dataset.shape

In [None]:
# Collapse the star ratings into two sentiment labels.
# Values that are not keys of the table come back as None from `dict.get`, so
# running this cell a second time (when the column already holds the string
# labels) would overwrite every label with None.
score_to_sentiment = {1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'}
dataset['score'] = dataset['score'].apply(score_to_sentiment.get)
dataset.head()

## Split the data into training (70%) and testing (30%) sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing, matching the 70/30 split announced
# in the section heading. Without `test_size`, train_test_split uses its
# default 25% test share.
# `random_state` fixes the shuffle so that Restart & Run All reproduces the
# same split (and therefore the same downstream results).
x_train, x_test, y_train, y_test = train_test_split(
    dataset['smry_txt'],
    dataset['score'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Sanity check: the training texts and training labels should have the same length.
x_train.shape, y_train.shape

In [None]:
# Sanity check: the test texts and test labels should have the same length.
x_test.shape, y_test.shape

# convert our text into vectors

### CountVectorizer

+ It converts each text (`smry_txt` in this case) into a vector with one entry per vocabulary word: the number of times that word occurs in the text, or 0 if the word does not occur in it.


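+ You can pass it raw text (capital letters, special characters, ...): by default it lowercases the text and tokenizes it, dropping punctuation and single-character tokens. Note that stop words are **not** removed by default; that only happens if you set the `stop_words` parameter.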

### Example of how CountVectorizer works

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with all-default settings, used for the toy example in the next cells.
cv = CountVectorizer()

In [None]:
# Fit the vectorizer on three toy sentences so that it learns their vocabulary.
# `fit` returns the vectorizer itself, so `sample_count_vector` is the same
# object as `cv`.
sample_count_vector = cv.fit(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"])
print('parameters for cv: \n {}'.format(sample_count_vector.get_params()))
print()
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement,
# `get_feature_names_out()` (available since 1.0), returns a NumPy array, so
# `.tolist()` keeps the printed output a plain list of strings.
print('features of our data : \n {}'.format(sample_count_vector.get_feature_names_out().tolist()))


+ ___Now we have a CountVectorizer that has learned all the features.___


+ ___Let's just convert each of our data into a vector (bag of words model)___
 

In [None]:
# Encode the same three toy sentences with the vocabulary learned by the fitted
# vectorizer, then show the type of the returned matrix.
sample_sentences = ["Hi How are you How are you doing", "Hi what's up", "Wow that's awesome"]
sample_x_data = sample_count_vector.transform(sample_sentences)
print(type(sample_x_data))

* We got a compressed sparse row matrix representation of the data.
* Let's see the original feature vector

In [None]:
# Show the vocabulary (one entry per matrix column) next to the dense count matrix.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is its replacement, and `.tolist()` turns the returned array into a plain list.
sample_count_vector.get_feature_names_out().tolist(), sample_x_data.toarray()

###  We can fit and transform the data at the same time..

In [None]:
cv1 = CountVectorizer()
# fit_transform learns the vocabulary from the given texts (fit) and converts
# each text into its count vector (transform) in a single call. The vectors are
# returned in Compressed Sparse Row format.
data_csr = cv1.fit_transform(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"])
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is its replacement, and `.tolist()` keeps the printed output a plain list.
print(cv1.get_feature_names_out().tolist())
print( data_csr.toarray())

# Let's do it with our training data

In [None]:
# Fresh default-settings vectorizer for the training reviews.
count_vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training reviews only and encode those reviews
# as a sparse document-term count matrix.
x_train_cv = count_vect.fit_transform(x_train)

In [None]:
# Show how a single review is represented once it has been vectorized.
first_review_vector = x_train_cv[0]
# Raw text of the first training review, followed by a blank line.
print(x_train.iloc[0], end="\n\n")
# Vocabulary terms that have a non-zero count in this review.
print(count_vect.inverse_transform(first_review_vector))
print()
# Sparse representation of the review's count vector.
print(first_review_vector)

## Logistic Regression on the training data

* ###  let's find the best C value for logistic regression (best hyper parameter)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [None]:
# Grid-search the inverse regularisation strength C of an L2-penalised logistic
# regression. GridSearchCV refits the best setting on all the data passed to
# `fit` (refit=True is its default), so the fitted search object can be used
# directly as the final classifier and no separate model needs to be built.
#
# The number of folds is pinned explicitly: GridSearchCV's default changed from
# 3-fold to 5-fold in scikit-learn 0.22, and this analysis was written for
# 3-fold cross validation.
#
# With exactly one label per review, micro-averaged F1 ('f1_micro') is the same
# number as accuracy.

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
log_reg = LogisticRegression(penalty='l2')
log_reg_grid_clf = GridSearchCV(log_reg, param_grid=param_grid, cv=3, n_jobs=2, scoring='f1_micro')
log_reg_grid_clf

In [None]:
# Separate default-settings vectorizer, used for the complete dataset.
count_vec_total = CountVectorizer()

In [None]:
# Learn the vocabulary from every review in `dataset` and encode all of them.
# NOTE(review): this includes the rows that train_test_split placed in x_test,
# so the vocabulary is not restricted to training data -- confirm this is intended.
x_data = count_vec_total.fit_transform(dataset.smry_txt)

In [None]:
# Labels for the complete dataset, aligned row-for-row with `dataset.smry_txt`.
y_data = dataset['score']
y_data.head()

In [None]:
# Compare the first review's raw text with its vectorized form.
first_row_text = dataset.iloc[0]['smry_txt']
first_row_vector = x_data[0]

print(first_row_text, end='\n\n')
# Vocabulary terms that have a non-zero count in this review.
print(count_vec_total.inverse_transform(first_row_vector), end='\n\n')
# Sparse representation of the review's count vector.
print(first_row_vector)

In [None]:
# Run the grid search on the vectorized complete dataset (x_data / y_data).
# NOTE(review): this fits on all reviews rather than on x_train_cv / y_train,
# so the x_test split is not held out from model selection -- confirm whether
# the search was meant to run on the training split only.
log_reg_grid_clf.fit(x_data, y_data)