In [None]:
!rm -rf dso-560-nlp-text-analytics && git clone https://github.com/ychennay/dso-560-nlp-text-analytics

Cloning into 'dso-560-nlp-text-analytics'...
remote: Enumerating objects: 3106, done.[K
remote: Counting objects: 100% (296/296), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 3106 (delta 211), reused 170 (delta 127), pack-reused 2810[K
Receiving objects: 100% (3106/3106), 92.07 MiB | 16.86 MiB/s, done.
Resolving deltas: 100% (377/377), done.
Checking out files: 100% (3178/3178), done.


In [None]:
# Work from the repository root so the relative "datasets/..." paths used when
# loading the reviews resolve.
%cd dso-560-nlp-text-analytics

/content/dso-560-nlp-text-analytics


# Logistic Regression

## Why Not Just Use A Linear Regression?

### Assumptions for Linear Models:
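- Gaussian distribution of residuals (errors)
- Y (target variable) is continuous on the prediction interval

A binary (0/1) target violates both assumptions: it takes only two values, so the residuals cannot be normally distributed, and a fitted straight line can predict values below 0 or above 1 that cannot be interpreted as probabilities.

![Binary target variable](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/binary.png)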

## Intro to Algorithmic Marketing (Katsov)
### Finding A Decision Boundary
![Finding a decision boundary](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/lr1.png)

### Log of Equal Odds 
![Log of equal odds](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/lr2.png)

### Logit Link Function
![Logit link function](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/lr3.png)

### Solving for Each Class (Binary Target)
![Solving for each class (binary target)](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/lr4.png)

### Log Likelihood
![Log likelihood](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/lr5.png)

In [None]:
import numpy as np
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The textbook formula calls ``exp(-z)``, which overflows to ``inf`` (and emits
    a RuntimeWarning) for large negative ``z``. Here the exponential is only ever
    taken of ``-|z|``, which lies in ``(0, 1]``, and the algebraically equivalent
    form ``exp(z) / (1 + exp(z))`` is used for negative inputs.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); array-likes are converted with ``np.asarray``.

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ``ndarray`` with the shape of ``z``.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1], so it cannot overflow
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op for n-d arrays.
    return result[()]

In [None]:
import pandas as pd


def read_reviews(path):
    """Return the lines of the text file at ``path`` and close the file afterwards."""
    # The context manager releases the file handle; the explicit encoding keeps
    # decoding independent of the platform's locale default.
    with open(path, encoding="utf-8") as review_file:
        return review_file.readlines()


poor = read_reviews("datasets/poor_amazon_toy_reviews.txt")
good = read_reviews("datasets/good_amazon_toy_reviews.txt")

# Label each review by the file it came from: 1 for the "good" file, 0 for the "poor" file.
good_reviews = [(review, 1) for review in good]
poor_reviews = [(review, 0) for review in poor]

all_reviews = good_reviews + poor_reviews
all_reviews_df = pd.DataFrame(all_reviews, columns=["review", "positive"])
all_reviews_df.head()

Unnamed: 0,review,positive
0,Excellent!!!\n,1
1,"""Great quality wooden track (better than some ...",1
2,my daughter loved it and i liked the price and...,1
3,Great item. Pictures pop thru and add detail a...,1
4,I was pleased with the product.\n,1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over single words (unigrams) with English stop words removed.
# The token pattern only accepts runs of two or more ASCII letters, and the
# vocabulary is capped at 1000 features.
vectorizer = CountVectorizer(
    token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b',
    max_features=1000,
    stop_words="english",
    ngram_range=(1, 1),
)

In [None]:
# Learn the vocabulary from the review text and build the sparse count matrix in one step.
review_text = all_reviews_df["review"]
X = vectorizer.fit_transform(review_text)

# Labels as a plain NumPy array, in the same row order as X.
y = all_reviews_df["positive"].values

X

<114917x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 926619 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.linear_model import LogisticRegression

# With the default iteration cap the solver stops before converging on this data
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give it more iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
from sklearn.metrics import confusion_matrix

y_pred = lr.predict(X)

# Accuracy on the same data the model was fit on: the fraction of predictions that
# match the labels. It is printed explicitly because only a cell's last expression
# is displayed automatically.
accuracy = np.mean(y_pred == y)
print(f"accuracy: {accuracy:.4f}")

# Rows are the true labels (0, 1); columns are the predicted labels (0, 1).
confusion_matrix(y, y_pred)

array([[  9087,   3613],
       [  1049, 101168]])

## AUROC (Area Under the Receiver Operating Characteristic Curve)

In [None]:
from sklearn.metrics import roc_auc_score

# AUROC measures how well the model *ranks* positives above negatives, so it needs
# a continuous score. Passing thresholded 0/1 predictions collapses the ROC curve
# to a single operating point and misstates the area.
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_score = lr.predict_proba(X)[:, 1]
roc_auc_score(y, y_score)

0.8526246651114863

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() materialises the whole sparse matrix as a dense array, which is
# far larger in memory than the sparse form.
data = pd.DataFrame(X.toarray(), columns=feature_names)
data["TARGET"] = y



In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and every score computed from it, reproducible
# across runs. Stratifying on the label keeps the class ratio the same in both
# parts, which matters because positive reviews heavily outnumber negative ones.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    stratify=data["TARGET"],
)

# Features are every column except the label.
X_train = train_df.drop(columns=["TARGET"])
X_test = test_df.drop(columns=["TARGET"])

y_train = train_df["TARGET"]
y_test = test_df["TARGET"]

In [None]:
# Sanity check: within each split, features and labels should have the same number of rows.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(86187, 1000)
(86187,)
(28730, 1000)
(28730,)


In [None]:
# Refit the same estimator on the training split only; this replaces the
# parameters learned earlier from the full data set.
lr.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
# Held-out accuracy: the share of test reviews whose predicted label equals the true label.
y_pred = lr.predict(X_test)

(y_pred == y_test).mean()

0.9578837452140619

## Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation over the full feature table (every column except the
# label); only the test-fold scores are requested.
X = data.drop(columns=["TARGET"])
cv_results = cross_validate(
    estimator=lr,
    X=X,
    y=y,
    cv=10,
    return_train_score=False,
)

In [None]:
# Test-fold scores from the cross-validation run, one value per fold.
cv_results['test_score']

array([0.9550992 , 0.95475113, 0.95744866, 0.95544727, 0.95475113,
       0.95857988, 0.95466411, 0.95570446, 0.95709686, 0.95561744])