# Capstone

### Introduction etc etc

***
***

# Imports, Loading Data

In [1]:
#imports
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import datetime
import re
import string
import seaborn as sns
import nltk
import cv2
from scipy import spatial
from nltk import word_tokenize, FreqDist
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from gensim.models import Word2Vec, word2vec
from mpl_toolkits.axes_grid1 import ImageGrid
from wordcloud import WordCloud

#models:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import tensorflow as tf
from keras.optimizers import SGD
from tensorflow.keras import preprocessing
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

from keras_preprocessing.sequence import pad_sequences


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#seed for reproducible results
np.random.seed(42)
tf.random.set_seed(42)

##### explain dataset in detail

In [2]:
data_dir_depression = './data/depression_dataset_reddit_cleaned.csv'

depression_df = pd.read_csv(data_dir_depression)

# EDA and Preprocessing

## Exploratory Data Analysis

In [3]:
#change to df for ease of use
df = depression_df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7731 entries, 0 to 7730
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_text     7731 non-null   object
 1   is_depression  7731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 120.9+ KB


In [4]:
df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [5]:
df.isna().sum()

clean_text       0
is_depression    0
dtype: int64

In [6]:
df['is_depression'].value_counts()

0    3900
1    3831
Name: is_depression, dtype: int64

##### ______ discuss balance_____________________

In [7]:
# flag, per post, whether its text is entirely lowercase
df['islowercase'] = df['clean_text'].str.islower()
df['islowercase'].value_counts()

True    7731
Name: islowercase, dtype: int64

## Preprocessing Data

Explain __________________________________________________________________________________________

will create variations of columns to be able to experiment with different options (with/without stopwords, etc)

### Lowercase Text

In this case, the text data from the downloaded dataset is already lowercase, so we do not need to take this step!

In [8]:
# df['lower_text'] = df['clean_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# df['lower_text'].head()

### Tokenize
Explain __________________________________________________________________________________________

In [9]:
def tokenize_data(data):
    """Split a text string into a list of word tokens.

    A token is a run of ASCII letters, optionally followed by an apostrophe
    and lowercase letters, so a contraction stays together as one token.
    """
    # letters, then an optional non-capturing 'suffix; the outer group is the token
    word_regex = "([a-zA-Z]+(?:'[a-z]+)?)"
    return list(nltk.regexp_tokenize(data, word_regex))

In [10]:
# tokenize every post; one list of tokens per row
df['tokenized'] = df['clean_text'].apply(tokenize_data)
df['tokenized'].head()

0    [we, understand, that, most, people, who, repl...
1    [welcome, to, r, depression, s, check, in, pos...
2    [anyone, else, instead, of, sleeping, more, wh...
3    [i, ve, kind, of, stuffed, around, a, lot, in,...
4    [sleep, is, my, greatest, and, most, comfortin...
Name: tokenized, dtype: object

### Stopwords
Explain __________________________________________________________________________________________

In [None]:
#show top FreqDist of Tokenized Column
# Count from a flat stream of tokens: sum(lists, []) re-copies the growing
# list on every addition, which is quadratic in the size of the corpus.
FreqDist(word for tokens in df['tokenized'] for word in tokens).most_common(30)

In [None]:
# NLTK's English stopword list ...
stops = stopwords.words('english')

# ... plus a few punctuation characters (extend() adds each character as its own entry)
stops.extend(",.'?!()")

#show example of stopwords list
print(stops)

In [None]:
def tokenize_remove_stops(data):
    """Tokenize a text string and drop every token listed in `stops`.

    Tokenization is delegated to tokenize_data so both helpers always share
    the same pattern. `stops` is the notebook-level stopword list; it is read
    at call time, so words added to it later take effect on the next call.

    Returns the remaining tokens as a list, in their original order.
    """
    # a set gives constant-time membership tests instead of scanning the list per token
    stop_set = set(stops)
    return [word for word in tokenize_data(data) if word not in stop_set]

In [None]:
# stopword-free token lists, one per post
df['tokenized_no_stops'] = df['clean_text'].apply(tokenize_remove_stops)

#show difference
df[['tokenized','tokenized_no_stops']].head()

In [None]:
#show top FreqDist now that stopwords are removed!
# count from a flat generator of tokens rather than sum(lists, []), which is quadratic
freq_minus_stops = FreqDist(word for tokens in df['tokenized_no_stops'] for word in tokens)
freq_minus_stops.most_common(30)

##### explain wa, etc __________________________________________________________

In [None]:
# add 'wa' only once, so re-running this cell does not keep growing the list
if 'wa' not in stops:
    stops.append('wa')

# rebuild the stopword-free tokens with the extended list
df['tokenized_no_stops'] = df.apply(lambda x: tokenize_remove_stops(x['clean_text']), axis=1)

# count from a flat generator of tokens rather than sum(lists, []), which is quadratic
freq_minus_stops = FreqDist(word for tokens in df['tokenized_no_stops'] for word in tokens)
freq_minus_stops.most_common(10)

##### show size diff _______________________________________________________

### Lemmatization
explain __________________________________________________________________________________________

In [None]:
# one shared WordNet lemmatizer instance for the whole notebook
lemmatizer = WordNetLemmatizer()

def lemmatize_data(data):
    """Return a new list with every token of `data` passed through lemmatizer.lemmatize."""
    lemmas = []
    for token in data:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
df['lemmatized_no_stops'] = df['tokenized_no_stops'].apply(lemmatize_data)
df['lemmatized_no_stops'].head()

#### Rejoining Words
final step for later use, explain

In [None]:
def rejoin_words(column):
    """Join an iterable of string tokens into one space-separated string."""
    separator = " "
    return separator.join(column)

# turn each lemmatized token list back into a single string
df['rejoined'] = df['lemmatized_no_stops'].apply(rejoin_words)
df['rejoined']

### The Preprocessed Dataset
##### explain _______________________________________

In [None]:
#show df minus the columns we dont need for this analysis
df[['clean_text', 'rejoined']].head()

#### Check for Significant Words in Corpus

specifically words that (to a human observer) may be dead giveaways for depression

In [None]:
# split the corpus by label
depressed_df = df[df['is_depression']==1]
undiag_df = df[df['is_depression']==0]

# 50 most frequent stopword-free tokens per class; counting from a flat
# generator avoids the quadratic list copying of sum(lists, [])
depressed_top = FreqDist(word for tokens in depressed_df['tokenized_no_stops'] for word in tokens).most_common(50)
undiag_top = FreqDist(word for tokens in undiag_df['tokenized_no_stops'] for word in tokens).most_common(50)

# (word, count) pairs -> Series indexed by word
depressed_top = pd.Series(dict(depressed_top))
undiag_top = pd.Series(dict(undiag_top))

In [None]:
depressed_top

In [None]:
undiag_top

In [None]:
depressed_top.index

In [None]:
# keep only the depressed-class top words that are not also top words of the other class
in_both_lists = depressed_top.index.isin(undiag_top.index)
depressed_top = depressed_top[~in_both_lists]
depressed_top

##### PLOT ________________________________________

In [None]:
# horizontal bars, most frequent word at the top
fig, ax = plt.subplots(figsize=(10, 10))
ax.invert_yaxis()
ax.barh(depressed_top.index, depressed_top.values)

# Show Plot
plt.show()

##### explanation _______________________________________________________

In [None]:
#word cloud:
# join each post's tokens into a string, then all posts into one text
rejoined = depressed_df['tokenized_no_stops'].apply(rejoin_words)
words = ' '.join(rejoined)

wordcloud = WordCloud(width=750, height=500, random_state=42, max_font_size=200, background_color='#401a24', colormap='Set2', collocations=False).generate(words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# create the output folder if needed so saving does not fail on a fresh checkout
os.makedirs('./img', exist_ok=True)
imgName= 'cloud-Depression'
wordcloud.to_file('./img/'+imgName+'.png')

In [None]:
#word cloud:
# join each post's tokens into a string, then all posts into one text
rejoined = undiag_df['tokenized_no_stops'].apply(rejoin_words)
words = ' '.join(rejoined)

wordcloud = WordCloud(width=750, height=500, random_state=42, max_font_size=200, background_color='green', colormap='Set3', collocations=False).generate(words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# create the output folder if needed so saving does not fail on a fresh checkout
os.makedirs('./img', exist_ok=True)
imgName= 'cloud-Undiag'
wordcloud.to_file('./img/'+imgName+'.png')

##### ________________Explain

# Modeling

##### __EXPLAIN STARTING FEATURE CHOICE, TARGET, ETC__
## ____

what "simple data" means

also talk about data sizes

starting original

### Splitting Data

In [None]:
X = df['clean_text']
y = df['is_depression']

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

### explain vectorizer

discuss max features

In [None]:
#max features 100 as a starting point
vectorizer = TfidfVectorizer(max_features=100)

# learn vocabulary and IDF weights from the training split only
X_train_vectorized = vectorizer.fit_transform(X_train)

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=feature_names)

### Metrics and Scoring
explain metric options

explain all in this context, but focus on acc, record other scores as well

ideal scores

In [None]:
#create score df
score_df = pd.DataFrame(columns = ['Name','Accuracy', 'Recall', 'Precision', 'F1'])

In [None]:
def update_score_df(name):
    """Build one score-table row for the current predictions.

    Compares the notebook-level `y_test` and `y_pred` (read as globals, not
    passed in) and returns a dict whose keys match the columns of `score_df`.
    """
    return {
        'Name': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
    }

## Baseline Model
explain model choice


In [None]:
#model name for score keeping
model_name = 'Baseline - MultiNB'

nb_model = MultinomialNB()

baseline_cv = cross_val_score(nb_model, X_train_vectorized, y_train)
baseline_cv

##### explain cross val score, why its so high _______________________________________

In [None]:
#reminder that true values are balanced evenly:
y_train.value_counts(normalize=True)

### Evaluate Model on Test Data

In [None]:
#fit model to train data
nb_model.fit(X_train_vectorized, y_train)

#vectorize test data with the vectorizer already fitted on the training data:
#re-fitting a new vectorizer on X_test would learn a different vocabulary and
#different IDF weights, so its columns would not line up with the features
#the model was trained on
X_test_vectorized = vectorizer.transform(X_test)

#predict on test data
y_pred = nb_model.predict(X_test_vectorized)

In [None]:
# create confusion matrix helper function:
def plot_confusion(name):
    """Draw, save and show the confusion matrix of the current predictions.

    Reads the notebook-level `y_test` and `y_pred`. `name` is used both as the
    plot title and as the file name of the PNG written under ./img/.
    """
    #confusion matrix for test data
    matrix = confusion_matrix(y_test, y_pred)

    # annotated heatmap of the raw counts
    heatmap_options = {'annot': True, 'cmap': 'Blues', 'fmt': '0.5g'}
    sns.heatmap(matrix, **heatmap_options)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(name)

    # write the image first, then display it
    output_path = './img/' + name + '.png'
    plt.savefig(output_path)
    plt.show()

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### explanation of results _____________________________________________________



##### coefficients etc ____________________________

In [None]:
coefficient_df = pd.DataFrame()

# get_feature_names() was removed from newer scikit-learn releases; fall back to it only when needed
coefficient_df['feature'] = (vectorizer.get_feature_names_out()
                             if hasattr(vectorizer, 'get_feature_names_out')
                             else vectorizer.get_feature_names())

# MultinomialNB.coef_ was deprecated and later removed; for a two-class model it
# held the per-feature log-probabilities of the second class, i.e. feature_log_prob_[1]
coefficient_df['coefficients'] = nb_model.feature_log_prob_[1]

#sort in descending order, to see which coefficient most affects the positive class prediction
coefficient_df.sort_values(by='coefficients', ascending=False)

##### further detail about problematic "giveaway" words _______________________________________________________________________

look at again after stop words out of the way

##### explain min_df and max_df as well _________________________________________________

### Baseline Model with Preprocessed Data

explain

##### Rejoin Data _____________________________explain why_________________________________________________

for tfidf vectorizer

In [None]:
#model name for score keeping
model_name = 'Baseline No Stops'

X = df['rejoined']
y = df['is_depression']

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)



##### explain pipeline ________________________________________________________

vectorizes as part of process

In [None]:
#create pipeline
pipe = Pipeline([('vectorizer', TfidfVectorizer(max_features=100)),
                          ('clf', MultinomialNB())
                         ])

#fit to data
pipe.fit(X_train, y_train)

In [None]:
#predict on test data
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### explain _________________________________---

identify how removal of stop words increased model performance

##### One more look at coefficients ______________________________________________________________

In [None]:
coefficient_df = pd.DataFrame()

# get_feature_names() was removed from newer scikit-learn releases; fall back to it only when needed
fitted_vectorizer = pipe['vectorizer']
coefficient_df['feature'] = (fitted_vectorizer.get_feature_names_out()
                             if hasattr(fitted_vectorizer, 'get_feature_names_out')
                             else fitted_vectorizer.get_feature_names())

# MultinomialNB.coef_ was deprecated and later removed; for a two-class model it
# held the per-feature log-probabilities of the second class, i.e. feature_log_prob_[1]
coefficient_df['coefficients'] = pipe['clf'].feature_log_prob_[1]

#sort in descending order, to see which coefficient most affects the positive class prediction
coefficient_df.sort_values(by='coefficients', ascending=False)[:20]

### Attempt One More Time with Subjective Alterations

go over subjective nature, etc

##### ______________________________________________

In [None]:
#same as stopword removal but our own list
def remove_some_words(data):
    """Return the tokens of `data` that are not 'depression', 'feel' or 'anxiety'."""
    excluded = ['depression', 'feel', 'anxiety']
    kept = []
    for token in data:
        if token in excluded:
            continue
        kept.append(token)
    return kept

In [None]:
#remove words from lemmatized dataset without stopwords
df['subjective_removal'] = df['lemmatized_no_stops'].apply(remove_some_words)

#rejoin again for vectorizer!
df['rejoined_post_removal'] = df['subjective_removal'].apply(rejoin_words)

In [None]:
#model name for score keeping
model_name = 'Baseline Adjusted Data'

#features: rejoined text after the subjective word removal; target: depression label
X, y = df['rejoined_post_removal'], df['is_depression']

#stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#tf-idf (at most 100 features) feeding a multinomial naive bayes classifier
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=100)),
    ('clf', MultinomialNB()),
])

#fit on the training split, then predict the held-out split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

comment on scoring, slightly less accurate, etc
##### __________

In [None]:
coefficient_df = pd.DataFrame()

# get_feature_names() was removed from newer scikit-learn releases; fall back to it only when needed
fitted_vectorizer = pipe['vectorizer']
coefficient_df['feature'] = (fitted_vectorizer.get_feature_names_out()
                             if hasattr(fitted_vectorizer, 'get_feature_names_out')
                             else fitted_vectorizer.get_feature_names())

# MultinomialNB.coef_ was deprecated and later removed; for a two-class model it
# held the per-feature log-probabilities of the second class, i.e. feature_log_prob_[1]
coefficient_df['coefficients'] = pipe['clf'].feature_log_prob_[1]

#sort in descending order, to see which coefficient most affects the positive class prediction
coefficient_df.sort_values(by='coefficients', ascending=False)[:20]

##### commentary __________________________________________

## Model 2 - MultinomialNB (Tuned)


##### explain gridsearch

##### explain parameters

In [None]:
#example with tfidf:

#TfidfVectorizer:
tfidf_params = {
    'vectorizer__max_df': (0.25, 0.5, 0.75, 1.0),
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__max_features': (None, 50, 100, 200, 300)
}

##### explain

In [None]:
#model name for score keeping
model_name = 'Tuned MNB'

#parameters:
#candidate smoothing values for the naive bayes step ('clf' in the pipeline)
model_params = {
    'clf__alpha': (1e-3, 1e-2, 1e-1, 1)
}

#create pipeline
#step names must match the 'vectorizer__' / 'clf__' prefixes used in the parameter grids
pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                          ('clf', MultinomialNB())])

############################################################################################

#a LIST of grids: GridSearchCV explores each dictionary on its own, so the
#vectorizer settings are searched with the default classifier and the classifier
#settings with the default vectorizer -- not every combination of the two
#NOTE(review): merge both dicts into one if the full cross-product was intended
params = [tfidf_params, model_params]

#gridsearch
#5-fold cross-validation on the training split, scored by accuracy, using all cores
gscv = GridSearchCV(pipe, params, cv = 5, n_jobs=-1, verbose=True, scoring = 'accuracy')
gscv.fit(X_train, y_train)

In [None]:
#see what hyperparameters were selected for best score
gscv.best_params_

In [None]:
#predict on test data
y_pred = gscv.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### talk about results _________________________________________

## Model 3 - Logistic Regression

In [None]:
#model name for score keeping
model_name = 'LogReg'

#parameters:
#10 log-spaced regularization strengths between 0.01 and 100, with both penalty types
model_params = {
    'clf__C': np.logspace(-2, 2, 10),
    'clf__penalty': ['l1', 'l2']
}

#create pipeline
#liblinear supports both penalties in the grid; the default lbfgs solver rejects
#'l1', so every l1 candidate would fail to fit instead of being evaluated
pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                          ('clf', LogisticRegression(solver='liblinear', random_state=42))])

############################################################################################

#a list of grids is searched one dictionary at a time (vectorizer settings and
#classifier settings separately), not as a cross-product
params = [tfidf_params, model_params]

#gridsearch
gscv = GridSearchCV(pipe, params, cv = 5, n_jobs=-1, verbose=True, scoring = 'accuracy')
gscv.fit(X_train, y_train)

In [None]:
#see what hyperparameters were selected for best score
gscv.best_params_

In [None]:
#predict on test data
y_pred = gscv.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### talk about results _________________________________________

## Model 4 - Decision Tree

In [None]:
#model name for score keeping
model_name = 'Decision Tree'

#tree hyperparameters to search ('clf' is the classifier step of the pipeline)
model_params = dict(
    clf__max_depth=[2, 3, 5, 10, 20],
    clf__min_samples_leaf=[5, 10, 20, 50, 100],
    clf__criterion=["gini", "entropy"],
)

#tf-idf features feeding a seeded decision tree
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier(random_state=42)),
])

#two grids: the shared tf-idf settings and this model's own settings
params = [tfidf_params, model_params]

#5-fold grid search scored by accuracy, using all cores
gscv = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
gscv.fit(X_train, y_train)

In [None]:
#see what hyperparameters were selected for best score
gscv.best_params_

In [None]:
#predict on test data
y_pred = gscv.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### talk about results _________________________________________

## Model 5 - Random Forest

In [None]:
#model name for score keeping
model_name = 'Random Forest'

#forest hyperparameters to search ('clf' is the classifier step of the pipeline)
model_params = dict(
    clf__n_estimators=[10, 100, 1000],
    clf__max_depth=[None, 3, 4, 10],
    clf__max_features=[2, 5, 10],
)

#tf-idf features feeding a seeded random forest
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

#two grids: the shared tf-idf settings and this model's own settings
params = [tfidf_params, model_params]

#5-fold grid search scored by accuracy, using all cores
gscv = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
gscv.fit(X_train, y_train)

In [None]:
#see what hyperparameters were selected for best score
gscv.best_params_

In [None]:
#predict on test data
y_pred = gscv.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### talk about results _________________________________________

## Model 6 - Support Vector Classification

In [None]:
#model name for score keeping
model_name = 'SVC'

#tf-idf features feeding a support vector classifier with default settings
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('clf', SVC()),
])

#only the shared tf-idf settings are searched for this model
params = [tfidf_params]

#5-fold grid search scored by accuracy, using all cores
gscv = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
gscv.fit(X_train, y_train)

In [None]:
#predict on test data
y_pred = gscv.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### talk about results _________________________________________

## Strongest Model Analysis

-look at score_df

-also look at conf matrices in a graph

In [None]:
#compare models all using same dataset:
score_df.drop([0,1], axis=0).sort_values(by=['Accuracy'], ascending=False).style.hide_index()

In [None]:
# show top 3 confusion:

# saved confusion-matrix images, each paired with its (hard-coded) subplot title
panels = [
    ('./img/LogReg.png', "Acc: 91.5% | F1: 91.2%"),
    ('./img/SVC.png', "Acc: 91.2% | F1: 90.6%"),
    ('./img/Random Forest.png', "Acc: 87.7% | F1: 87.3%"),
]

# create figure
fig = plt.figure(figsize=(25, 10))

# one row, one column per image
rows = 1
columns = 3

for position, (image_path, panel_title) in enumerate(panels, start=1):
    # OpenCV loads images as BGR; convert to RGB for matplotlib
    bgr_image = cv2.imread(image_path)
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # place the image in the next slot of the grid
    fig.add_subplot(rows, columns, position)
    plt.imshow(rgb_image)
    plt.axis('off')
    plt.title(panel_title)

##### explain _______________________________

In [None]:
X = df['rejoined_post_removal']
y = df['is_depression']
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

#remake LogReg model with best parameters from earlier gridsearch
pipe = Pipeline([('vectorizer', TfidfVectorizer(max_features=100)),
                          ('clf', LogisticRegression(C =4.6415888336127775, penalty = 'l2'))
                         ])

#fit to data
pipe.fit(X_train, y_train)

In [None]:
coefficient_df = pd.DataFrame()

# get_feature_names() was removed from newer scikit-learn releases; fall back to it only when needed
fitted_vectorizer = pipe['vectorizer']
coefficient_df['feature'] = (fitted_vectorizer.get_feature_names_out()
                             if hasattr(fitted_vectorizer, 'get_feature_names_out')
                             else fitted_vectorizer.get_feature_names())

# coef_ has one row per fitted class boundary (a single row here); flatten it to a 1-D column
coefficient_df['coefficients'] = pipe['clf'].coef_.ravel()

#sort in descending order, to see which coefficient most affects the positive class prediction
coefficient_df.sort_values(by='coefficients', ascending=False)[:15]

In [None]:
# 20 largest coefficients of the fitted model
coefs = coefficient_df.sort_values(by='coefficients', ascending=False)[:20]

fig = plt.figure(figsize =(10, 10))
plt.gca().invert_yaxis()
plt.barh(coefs['feature'], coefs['coefficients'])
plt.title('Highest Coefficients of Strongest Model')

# save BEFORE showing: once plt.show() has displayed the figure, a following
# plt.savefig() targets a new, empty current figure and writes a blank image
figname='coef-FinalModel'
fig.savefig('./img/'+figname+'.png')

# Show Plot
plt.show()

##### commentary __________________________________________

##### WITH PROBLEMATIC WORDS: _____________________

In [None]:
X = df['rejoined']
y = df['is_depression']
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

#remake LogReg model with best parameters from earlier gridsearch
pipe = Pipeline([('vectorizer', TfidfVectorizer(max_features=100)),
                          ('clf', LogisticRegression(C =4.6415888336127775, penalty = 'l2'))
                         ])

#fit to data
pipe.fit(X_train, y_train)

In [None]:
coefs

In [None]:
coefficient_df = pd.DataFrame()

# get_feature_names() was removed from newer scikit-learn releases; fall back to it only when needed
fitted_vectorizer = pipe['vectorizer']
coefficient_df['feature'] = (fitted_vectorizer.get_feature_names_out()
                             if hasattr(fitted_vectorizer, 'get_feature_names_out')
                             else fitted_vectorizer.get_feature_names())

# coef_ has a single row here; flatten it to a 1-D column
coefficient_df['coefficients'] = pipe['clf'].coef_.ravel()

# 20 largest coefficients of the fitted model
coefs = coefficient_df.sort_values(by='coefficients', ascending=False)[:20]

###########################################################
# bars with a coefficient of 6 or more are drawn in red, the rest in the default blue
problem_colors = ['#1f77b4' if (x < 6) else 'red' for x in coefs['coefficients'] ]

fig = plt.figure(figsize =(10, 10))
plt.gca().invert_yaxis()
plt.barh(coefs['feature'], coefs['coefficients'], color = problem_colors)
plt.title('Highest Coefficients of Strongest Model \n(With Problematic Features)')

# save BEFORE showing: once plt.show() has displayed the figure, a following
# plt.savefig() targets a new, empty current figure and writes a blank image
figname='coef-FinalModel-withProblematic'
fig.savefig('./img/'+figname+'.png')

# Show Plot
plt.show()

##### commentary __________________________________________

In [None]:
#discuss findings, next steps

## _______________________________________ bigrams, trigrams with prominence

In [None]:
# flatten the per-post token lists into one word sequence; a bare sum() starts
# from 0 and cannot add lists, so it raised a TypeError here
all_words = [word for tokens in df['tokenized'] for word in tokens]

# from_words expects a sequence of tokens; iterating a DataFrame yields its column names instead
# NOTE(review): assumes the depressed-class posts were the intended corpus
# (the original passed depressed_df) -- confirm
depressed_words = [word for tokens in depressed_df['tokenized'] for word in tokens]

bigram_measures = nltk.collocations.BigramAssocMeasures()

bi_finder = BigramCollocationFinder.from_words(depressed_words)

# score every bigram by its raw relative frequency
bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)

bi_scored[:5]

In [None]:
#pointwise mutual information finder

In [None]:
# NOTE(review): the original argument was truncated ("depressed_d'])") and did not
# parse; the flattened tokens of the depressed-class posts are assumed -- confirm
depressed_words = [word for tokens in depressed_df['tokenized'] for word in tokens]

# re-created here so this cell does not depend on the previous one having run
bigram_measures = nltk.collocations.BigramAssocMeasures()

pmi_finder = BigramCollocationFinder.from_words(depressed_words)

# ignore bigrams that occur fewer than 5 times
pmi_finder.apply_freq_filter(5)
pmi_scored = pmi_finder.score_ngrams(bigram_measures.pmi)

pmi_scored

##### _________________________________________________-

explain need for word embeddings

# Word Embeddings

##### explain _________________________________________________________

## Word2Vec

#### window size ________________________________
explain similarity to n grams from earlier

##### ______________________________________________________

explain why we are using full dataset (with 'depression' etc)

In [None]:
model_name='W2V Model'

w2v_X=df['tokenized']
w2v_model = Word2Vec(w2v_X, vector_size=100, window=5, min_count=2)

w2v_model.wv.index_to_key[:20]

In [None]:
w2v_model.wv.most_similar('depression')

In [None]:
w2v_model.wv.most_similar('anxiety')

##### explain why we will use pre trained


## GLOVE, PRETRAINED, EXPLAIN ___________________________________

__using both 50-dimensonal and 100-dimensional__

In [None]:
#get total vocabulary of our data so that we only take words we need:

#tokenized data with all words
X=df['tokenized']
y=df['is_depression']

#split current data for modeling
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [None]:
#get total vocabulary of the entire dataset: X is the full tokenized column,
#so this covers the training and test posts alike
total_vocabulary = set(word for entry in X for word in entry)

#show total vocab:
print('There are {} unique tokens in the dataset.'.format(len(total_vocabulary)))

In [None]:
def load_glove_vectors(path, vocabulary):
    """Read a GloVe text file and keep only the words we need.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are its vector components.

    Parameters
    ----------
    path : str
        Location of the GloVe text file.
    vocabulary : container of str
        Words to keep; every other word in the file is skipped.

    Returns
    -------
    dict
        Maps each kept word to a float32 numpy array of its components.
    """
    vectors = {}
    with open(path, 'rb') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # skip blank lines instead of failing on parts[0]
                continue
            word = parts[0].decode('utf-8')
            if word in vocabulary:
                vectors[word] = np.array(parts[1:], dtype=np.float32)
    return vectors

#search glove.6B.50d data for our own words:
glove50 = load_glove_vectors('./data/glove.6B.50d.txt', total_vocabulary)

#same with 100d data
glove100 = load_glove_vectors('./data/glove.6B.100d.txt', total_vocabulary)

In [None]:
glove50['depression']

In [None]:
glove100['depression']

In [None]:
#example of glove pre-trained model's similar words:

def find_closest_embeddings(embedding):
    """Return every word in glove50, ordered by Euclidean distance to `embedding` (nearest first)."""
    def distance_to_target(word):
        return spatial.distance.euclidean(glove50[word], embedding)

    return sorted(glove50, key=distance_to_target)

find_closest_embeddings(glove50["depression"])[:10]

##### explain need for mean word embeddings _______________________________________________________________--

In [None]:
# Creating Mean Word Embeddings
# For this step, it's worth the extra effort to write your own mean embedding vectorizer class, so that you can make use of pipelines from scikit-learn. Using pipelines will save us time and make the code a bit cleaner.

# The code for a mean embedding vectorizer class is included below, with comments explaining what each step is doing. Take a minute to examine it and try to understand what the code is doing.

In [None]:
class W2vVectorizer(object):
    """Mean-embedding vectorizer that can be used as a scikit-learn pipeline step.

    Each document (an iterable of tokens) becomes the element-wise mean of the
    vectors of its tokens that are present in the `w2v` mapping. A document
    with no known tokens becomes a zero vector of length `dimensions`.
    """

    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            # infer the vector length from the mapping that was passed in
            # (not from a notebook-level global such as glove50)
            self.dimensions = len(w2v[next(iter(w2v))])

    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline
    def fit(self, X, y=None):
        """Do nothing and return self; `y` is optional and ignored."""
        return self

    def transform(self, X):
        """Return a 2-D array with one mean vector per document in `X`."""
        return np.array([
            # an empty list is falsy, so documents without known tokens fall back to zeros
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                    or [np.zeros(self.dimensions)], axis=0) for words in X])

### Glove Model 1 - LogReg

In [None]:
#model name for score keeping
model_name = 'Glove-50 LogReg'

#remake LogReg model with best parameters from earlier gridsearch
pipe = Pipeline([('vectorizer', W2vVectorizer(glove50)),
                          ('clf', LogisticRegression(C =4.6415888336127775, penalty = 'l2', max_iter=1000))
                         ])

#fit to data
pipe.fit(X_train, y_train)

In [None]:
#predict on test data
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

### Selecting a More Fitting Model _________________________________________________

In [None]:
# three candidate classifiers, each on top of mean GloVe-50 embeddings
rf = Pipeline(steps=[
    ('Word2Vec Vectorizer', W2vVectorizer(glove50)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True)),
])
svc = Pipeline(steps=[
    ('Word2Vec Vectorizer', W2vVectorizer(glove50)),
    ('Support Vector Machine', SVC()),
])
lr = Pipeline(steps=[
    ('Word2Vec Vectorizer', W2vVectorizer(glove50)),
    ('Logistic Regression', LogisticRegression()),
])

models = [
    ('Random Forest', rf),
    ('Support Vector Machine', svc),
    ('Logistic Regression', lr),
]

# mean 2-fold cross-validation score on the training split, per candidate
scores = [
    (label, cross_val_score(candidate, X_train, y_train, cv=2).mean())
    for label, candidate in models
]

In [None]:
scores

### Glove Model 2 - Random Forest

In [None]:
#gridsearch on best model:

#model name for score keeping
model_name = 'Glove-50 Random Forest'

#forest hyperparameters to search ('clf' is the classifier step of the pipeline)
params = {
    'clf__n_estimators': [10,100,1000],
    'clf__max_depth': [None, 3, 4, 10],
    'clf__max_features': [2, 5, 10]

}

#random forest on top of mean GloVe-50 embeddings
pipe = Pipeline([('vectorizer', W2vVectorizer(glove50)),
                          ('clf', RandomForestClassifier(random_state=42))
                         ])

#gridsearch
gscv = GridSearchCV(pipe, params, cv = 5, n_jobs=-1, verbose=True, scoring = 'accuracy')
gscv.fit(X_train, y_train)

#carry the tuned model forward: with the default refit=True, GridSearchCV refits
#the best parameter combination on all of X_train, so the later pipe.predict()
#uses the tuned forest instead of an untuned default one
pipe = gscv.best_estimator_

In [None]:
#predict on test data
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### explain ________________________________

In [None]:
#gridsearch on best model:

#model name for score keeping
model_name = 'Glove-100 Random Forest'

#forest hyperparameters to search ('clf' is the classifier step of the pipeline)
params = {
    'clf__n_estimators': [10,100,1000],
    'clf__max_depth': [None, 3, 4, 10],
    'clf__max_features': [2, 5, 10]

}

#random forest on top of mean GloVe-100 embeddings
pipe = Pipeline([('vectorizer', W2vVectorizer(glove100)),
                          ('clf', RandomForestClassifier(random_state=42))
                         ])

#gridsearch
gscv = GridSearchCV(pipe, params, cv = 5, n_jobs=-1, verbose=True, scoring = 'accuracy')
gscv.fit(X_train, y_train)

#carry the tuned model forward: with the default refit=True, GridSearchCV refits
#the best parameter combination on all of X_train, so the later pipe.predict()
#uses the tuned forest instead of an untuned default one
pipe = gscv.best_estimator_

In [None]:
#predict on test data
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

plot_confusion(model_name)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on any pandas version
score_df = pd.concat([score_df, pd.DataFrame([update_score_df(model_name)])], ignore_index=True)
score_df

##### explain ________________________________

##### coefficients ________________________________________

###### again without 'depression' etc???

***
***

# Conclusion


remember to include detailed info in the intro, conclusion, and README about all models and methods used
##### __________________________________________________________________________________