# Capstone Project
*Author: Samuel Leadley*
## Preprocessing, Modeling, and Evaluation

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import datetime
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import regex as re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
import warnings 
warnings.simplefilter('ignore')

## Load Data

In [4]:
# Read the cleaned letters and discard the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved -- confirm).
shareholder_letters = (
    pd.read_csv('../datasets/clean_df.csv')
    .drop(columns='Unnamed: 0')
)
shareholder_letters.head()

Unnamed: 0,company,ticker,sector,year,letter_to_shareholder,net_income,target
0,Goldman Sachs,GS,Financials,1999,this is our first letter to shareholders inclu...,2.708,1.0
1,Goldman Sachs,GS,Financials,2000,was a remarkable year for goldman sachs and a...,3.067,1.0
2,Goldman Sachs,GS,Financials,2001,it is impossible to discuss without beginning...,2.31,0.0
3,Goldman Sachs,GS,Financials,2002,it was a challenging year for goldman sachs th...,2.114,0.0
4,Goldman Sachs,GS,Financials,2003,looking back on we take pride in our performa...,3.005,1.0


## Preprocessing
### Lemmatizing Words
The EDA showed that closely related word forms, such as "year" and "years" or "business" and "businesses", appeared frequently. To reduce the number of features and help improve my models, I decided to lemmatize each word, reducing it to its base form (lemma).

In [5]:
# Set up the two text-processing helpers used below:
#   lemmatizer - WordNet-based lemmatizer applied to each token
#   tokenizer  - splits text into runs of word characters (regex \w+)
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [6]:
# For every letter: split it into word tokens, lemmatize each token, and
# join the lemmas back into a single space-separated string.
lemm_letters = [
    ' '.join(lemmatizer.lemmatize(word) for word in tokenizer.tokenize(letter))
    for letter in shareholder_letters['letter_to_shareholder']
]

In [7]:
# Wrap the lemmatized letters in a one-column data frame (the column is labelled 0)
lemm_letters_df = pd.DataFrame(data=lemm_letters)

In [8]:
# Overwrite the raw letter text with its lemmatized version.
# The assignment aligns on the row index of both frames.
shareholder_letters['letter_to_shareholder'] = lemm_letters_df[0]

### Create Variables and Train-Test-Split

In [9]:
# Checking the balance of classes: share of rows belonging to each target value
shareholder_letters.loc[:, 'target'].value_counts(normalize=True)

1.0    0.692771
0.0    0.307229
Name: target, dtype: float64

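The proportion of the positive (majority) class shown above, roughly 69.3%, is also the baseline accuracy for my models: it is the score obtained by always predicting the positive class.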

In [13]:
# Features are the lemmatized letter texts; the label is the binary target column.
y = shareholder_letters["target"]
X = shareholder_letters["letter_to_shareholder"]

# Stratify on y so that both splits keep the class proportions of the full
# data set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=26,
)

### Create Custom Stop Words

TO DO.

## Modeling
### Logistic Regression

In [14]:
# define a function that grid-searches a pipeline, prints its scores, and
# returns the fitted search object
def pipe_searcher(pipe, params, cv=3, scoring="accuracy"):
    """Grid-search ``pipe`` over ``params`` and report the resulting scores.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and
    then scored on those same training data and on ``X_test`` / ``y_test``,
    so the train-test-split cell must have been run before calling this.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator) passed to ``GridSearchCV`` as ``estimator``.
    params : dict
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default "accuracy"
        Scoring metric forwarded to ``GridSearchCV``; it is also the metric
        behind the printed training and testing scores.

    Returns
    -------
    GridSearchCV
        The fitted search object (not a score), so callers can inspect
        ``best_estimator_``, ``best_params_``, ``cv_results_`` and so on.
    """
    search = GridSearchCV(estimator=pipe, param_grid=params, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)

    # best_score_ is the cross-validated score of the winning parameter set;
    # the two lines after it score the search on the full training and test data.
    print(f'CrossVal Score: {search.best_score_}')
    print(f'Training Score: {search.score(X_train, y_train)}')
    print(f'Testing Score: {search.score(X_test, y_test)}')
    print(search.best_params_)
    return search

In [15]:
# instantiate pipe for logistic regression and TfIdf
# The solver is pinned to 'liblinear' because the grid below searches both
# 'l1' and 'l2' penalties. The 'lbfgs' solver (the LogisticRegression default
# since scikit-learn 0.22) does not support 'l1', so every l1 candidate would
# fail to fit; with warnings silenced, those failures would go unnoticed.
lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression(solver='liblinear')),
])

# Hyperparameter grid: vectorizer vocabulary size, n-gram range and
# document-frequency cutoff, plus regularization strength and penalty type.
lr_params = {
    'tfidf__max_features': [100_000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.5, 0.8],
    'lr__C': [0.001, 0.01, 0.02],
    'lr__penalty': ['l1', 'l2'],
}

In [16]:
# Run the grid search for the logistic-regression pipeline and keep the returned search object
lr_model = pipe_searcher(pipe=lr_pipe, params=lr_params)

CrossVal Score: 0.6935483870967742
Training Score: 0.6935483870967742
Testing Score: 0.6904761904761905
{'lr__C': 0.001, 'lr__penalty': 'l2', 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 1)}


### Decision Tree

In [17]:
# Pipeline: TF-IDF features fed into a decision tree classifier (seeded for reproducibility)
dt_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('dt', DecisionTreeClassifier(random_state=26)),
])

# Grid over the vectorizer's vocabulary size / n-gram range and the tree's
# leaf-size and depth limits.
dt_params = dict(
    tfidf__max_features=[100_000, None],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    dt__min_samples_leaf=[1, 2],
    dt__max_depth=[500, None],
)

In [18]:
# Run the grid search for the decision-tree pipeline and keep the returned search object
dt_model = pipe_searcher(pipe=dt_pipe, params=dt_params)

CrossVal Score: 0.6370967741935484
Training Score: 1.0
Testing Score: 0.5476190476190477
{'dt__max_depth': 500, 'dt__min_samples_leaf': 1, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 1)}


### Random Forest

In [19]:
# Pipeline: TF-IDF features fed into a random forest classifier (seeded for reproducibility)
rf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('rf', RandomForestClassifier(random_state=26)),
])

# Grid over the vectorizer's vocabulary size / n-gram range and the forest's
# size, minimum leaf size and depth limit.
rf_params = dict(
    tfidf__max_features=[50_000, 100_000, None],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    rf__n_estimators=[30, 35],
    rf__min_samples_leaf=[8, 10],
    rf__max_depth=[None, 500],
)

In [20]:
# Run the grid search for the random-forest pipeline and keep the returned search object
rf_model = pipe_searcher(pipe=rf_pipe, params=rf_params)

CrossVal Score: 0.6935483870967742
Training Score: 0.7580645161290323
Testing Score: 0.6904761904761905
{'rf__max_depth': None, 'rf__min_samples_leaf': 8, 'rf__n_estimators': 30, 'tfidf__max_features': 50000, 'tfidf__ngram_range': (1, 1)}
