# Regressor to distinguish Snoop Dog and Beatles texts

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
import os
import re
 
def convert_text(s):
    # Removes all characters from string except letters and digits and convert letters to lowercase
    return re.sub("[^a-zA-Z0-9]", " ", s.lower())
 
def read_texts(dir_path="./9sem_data/beatles/"):
    # Reads all files from directory
    if dir_path[-1] != os.path.sep:
        dir_path = dir_path + os.path.sep
    txt_list = []
    for file in os.listdir(dir_path):
        file = dir_path + file
        fin = open(file, 'r', encoding='latin1')
        txt = " ".join(fin.readlines())
        txt = convert_text(txt)
        txt_list.append(txt)
    return txt_list

In [10]:
beatles_data = read_texts("./9sem_data/beatles/")
snoop_data = read_texts("./9sem_data/snoop/")

In [16]:
beatles_data = np.array(beatles_data)
snoop_data = np.array(snoop_data)[:74]

## Stemming

In [17]:
from nltk.stem.snowball import SnowballStemmer

In [24]:
# make stemming for texts
ss = SnowballStemmer('english')
stem_beatles = []
stem_snoop = []
for i in range(beatles_data.shape[0]):
    stem_beatles.append(ss.stem(beatles_data[i]))
    stem_snoop.append(ss.stem(snoop_data[i]))

## Create basic dataset

In [25]:
beatles_data = np.array(stem_beatles)
snoop_data = np.array(stem_snoop)

In [27]:
beatles_data.shape

(74,)

In [37]:
X = np.hstack((beatles_data, snoop_data))

In [38]:
X.shape

(148,)

In [39]:
# 1 = beatles, 0 = snoop
y_b = np.full((74, ), 1)
y_s = np.full((74, ), 0)

In [40]:
y = np.hstack((y_b, y_s))

In [42]:
y.shape

(148,)

## Embedding

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [44]:
count = CountVectorizer(token_pattern=r"\b\w+\b")
result = count.fit_transform(X)
X = result

In [48]:
X.shape

(148, 5023)

In [49]:
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)

In [50]:
X.shape

(148, 5023)

### Train, CV, Test

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, shuffle=True)

### LogisticRegression with no regularization

In [148]:
lr = LogisticRegression(C=1e9)

In [149]:
kf = KFold(n_splits=5)
y_pred = np.zeros(y_train.shape)
for train, test in kf.split(X_train):
    lr.fit(X_train[train], y_train[train])
    y_pred[test] = lr.predict(X_train[test])

In [150]:
print('roc-auc = ', roc_auc_score(y_train, y_pred))

roc-auc =  0.957627118644


In [151]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [152]:
print('roc-auc = ', roc_auc_score(y_test, y_pred))

roc-auc =  0.933333333333


### LogisticRegression with L2

In [153]:
lr = LogisticRegression(penalty='l2', C=0.1)
kf = KFold(n_splits=5)
y_pred = np.zeros(y_train.shape)
for train, test in kf.split(X_train):
    lr.fit(X_train[train], y_train[train])
    y_pred[test] = lr.predict(X_train[test])

print('roc-auc = ', roc_auc_score(y_train, y_pred))
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('roc-auc = ', roc_auc_score(y_test, y_pred))

roc-auc =  0.838983050847
roc-auc =  0.966666666667


In [175]:
param_grid = {'C': np.logspace(-6, 2, 10), 'penalty':['l2']} 
gs = GridSearchCV(LogisticRegression(), param_grid=param_grid, scoring='roc_auc') 
gs

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-06,   7.74264e-06,   5.99484e-05,   4.64159e-04,
         3.59381e-03,   2.78256e-02,   2.15443e-01,   1.66810e+00,
         1.29155e+01,   1.00000e+02]), 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [176]:
#y_train = y_train.reshape(118, -1)
y_train.shape

(118,)

In [177]:
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-06,   7.74264e-06,   5.99484e-05,   4.64159e-04,
         3.59381e-03,   2.78256e-02,   2.15443e-01,   1.66810e+00,
         1.29155e+01,   1.00000e+02]), 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [178]:
gs.best_params_

{'C': 100.0, 'penalty': 'l2'}

In [179]:
bst = gs.best_estimator_
print(bst)
bst.fit(X_train, y_train)
y_pred = bst.predict(X_test)
print('roc-auc = ', roc_auc_score(y_test, y_pred))

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
roc-auc =  0.933333333333


In [180]:
gs.best_score_

0.99125780553077614

In [181]:
y_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1])