In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# reading in training data

In [39]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression

import indicoio
from textblob import TextBlob
import _pickle as cPickle
import math
import sys

import seaborn as sns
import matplotlib.pyplot as plt

'''
[INPUT]
ticker: 4-5 digit ticker on nyse [STRING]
day: whether prediction is for the next day [BOOLEAN]
week: whether prediction is for the next week [BOOLEAN]

[OUTPUT]
predicted_price: our prediction on tomorrow's price [FLOAT]
'''
def future_runner(ticker):
    today = datetime.date.today()
    testing_df = pipeline_linear(ticker, today, dyn=True)
    model = create_model_linear(ticker)
    prediction = run_model_linear(testing_df, model)
    prediction_formatted = "Our dynamically constructed model predicted ${0} for tomorrow's price.".format(prediction)
    return prediction_formatted

'''
[INPUT]
ticker: 4-5 digit ticker on nyse [STRING]
date: yyyy-mm-dd formatted datestring [STRING]

[OUTPUT]
rating: BUY or SELL [String]
delta: "TICKER went down by AMOUNT." [String]
good_count: amount of good headlines out of 100 [Int]
good_headlines: sampling of five good headlines [List<String>]
bad_count: amount of bad headlines out of 100 [Int]
bad_headlines: sampling of five bad headlines [List<String>]
news_category: GOOD or OKAY or BAD [String]
metadata: full company name, sector, industry [List<String>]
'''
def past_runner(ticker, date):
    df = pipeline_logistic(ticker, date)
    predicted_delta, actual_delta = run_model_logistic(df, "data/logistic.pkl")
    rating, delta = translate_delta(predicted_delta), find_delta(df["Start"][0], df["End"][0], ticker)
    headlines = df["headlines"][0]
    good_headlines, bad_headlines, good_count, bad_count = classify_headlines(headlines)
    news_category = make_category(good_count, bad_count)
    metadata = make_alias(ticker)
    return rating, delta, good_count, good_headlines, bad_count, bad_headlines, news_category, metadata

'''
**************** UTILITY FUNCTIONS ****************
create_model_linear: dynamically constructs linear regression model
weekly_visualization: saves a png figure of sentiment by week as weekly_sentiment.png
make_alias: turns a ticker into its full name, sector, and industry in a list
pretty_print: takes inputs from past_runner, outputs them for testing
printlist: pretty prints a list of strings
find_delta: outputs a formatted string for stock movement
translate_delta: changes label into a buy/sell rating
make_category: transforms good/bad headline counts into a status string
classify_headlines: returns counts and samples for a single date's headlines
single_headlines: processes one headline
impute: performs normalization prior to logistic regression
good_bag/bad_bag: creates bag of words features from word lists and text corpus
aggregate_jsons: turns a response from tiingo client into a corpus
pickle_down: unpickles a model to be run on user query
six_days: calculates end date of a week from a date
remove_time: formats a string in yyyy-mm-dd style
pipeline: transforms a single date and row into observation for feature matrix
multi_row_pipeline: computes entire feature matrix
run_model_logistic: runs logistic regression on test point
run_model_linear: runs logistic regression on tomorrow
output_graph: saves a weekly sentiment graph to "week_sent.png"
****************************************************
'''


def related_tickers(ticker):
    industry = make_alias(ticker)[2]
    name = make_alias(ticker)[0]
    all_ticker_df = pd.read_csv("data/ticker_translate.csv")
    related_metadata = all_ticker_df.loc[(all_ticker_df["Industry"] == industry) & (all_ticker_df["Name"] != name), "Name"].tolist()[0:5]
    return {
        "industry": industry,
        "related_companies": related_metadata
    }


def create_model_linear(ticker):
    dates = pd.read_csv(r"data/dates.csv")["0"].tolist()
    master_df_set = []
    for date in dates:
        row = pipeline_linear(ticker, date, dyn=False)
        master_df_set.append(row)
    training_df = pd.concat(master_df_set)
    X_train = training_df[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag", "lastweek", "absolute_change", "macro_direction"]]
    Y_train = training_df["price"]
    model = LinearRegression().fit(X_train, Y_train)
    return model


def make_alias(ticker):
    tickers = pd.read_csv("data/ticker_translate.csv")
    ticker = ticker.upper()
    data = tickers.loc[tickers["Ticker"] == ticker, ["Name", "Sector", "Industry"]]
    return data.values.tolist()[0]  # name, sector, industry


def pretty_print(a, b, c, d, e, f, g):
    print("RATING: {0} \n{1}".format(a, b), file=sys.stderr)
    print("{0} good headlines:".format(c), file=sys.stderr)
    printlist(d)
    print("{0} bad headlines:".format(e), file=sys.stderr)
    printlist(f)
    print("news rating: {0}".format(g), file=sys.stderr)
    return


def printlist(lis):
    for l in lis:
        print("     " + str(l), file=sys.stderr)


# display functions
def find_delta(start, end, ticker):
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    prices = client.get_ticker_price(ticker, fmt='object', startDate=start, endDate=end, frequency='daily')
    delta = prices[-1].close - prices[0].open
    delta = math.ceil(delta*100)/100
    if delta < 0:
        return "{0} went down by {1}.".format(ticker, delta)
    else:
        return "{0} went up by {1}.".format(ticker, delta)


def translate_delta(delta):
    if delta == 1:
        return "BUY"
    else:
        return "SELL"


# Headline classifier functions
def make_category(goods, bads):
    ratio = goods/bads
    if ratio > 1.25:
        return "GOOD" #green
    elif ratio > 0.75:
        return "OKAY" #yellow
    else:
        return "BAD" #red


def classify_headlines(headline_list):
    bad, good = [], []
    for headline in headline_list:
        ind_polarity = single_headlines(headline)
        if ind_polarity > 0:
            good.append([headline, ind_polarity])
        else:
            bad.append([headline, ind_polarity])
    bad_count, good_count = len(bad), len(good)
    baddest, goodest = sorted(bad, key=lambda x: x[1], reverse=False), sorted(good, key=lambda x: x[1], reverse=True)
    baddest, goodest = [b[0] for b in baddest], [g[0] for g in goodest]
    baddest, goodest = baddest[0:5], goodest[0:5]

    return goodest, baddest, good_count, bad_count


def single_headlines(headline):
    headline = TextBlob(headline)
    p = headline.sentiment.polarity
    return p


# feature functions
def impute(X):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    return X


def good_bag(string_vector):
    bag = pd.read_csv("data/sentiment_word_list.csv")
    good_bag = bag["good"].dropna().apply(str.lower).to_list()
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in good_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if word.lower() == word2.lower():
                count = count + 1
    return count/ttl_length


def bad_bag(string_vector):
    bag = pd.read_csv("data/sentiment_word_list.csv")
    bad_bag = bag["bad"].apply(str.lower).to_list()
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in bad_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if word.lower() == word2.lower():
                count = count + 1
    return count/ttl_length


def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result


def pickle_down(filepath):
    with open(filepath, 'rb') as fid:
        model_loaded = cPickle.load(fid)
    return model_loaded


def six_days(start_date):
    return start_date + datetime.timedelta(days=6)


def remove_time(dt):
    return dt[0:10]


def featurize_lidar(start, end):
    lidars = pd.read_csv("data/macros.csv")
    start_lidar = lidars.loc[lidars["Date"] == start, "Macro"].tolist()[0]
    end_lidar = lidars.loc[lidars["Date"] == end, "Macro"].tolist()[0]
    absolute_change = end_lidar - start_lidar
    if absolute_change < 0:
        delt = 0
    else:
        delt = 1
    return absolute_change, delt


def base_pipeline(ticker, date, dynamic=False):
    # api auth
    indicoio.config.api_key = "04878c9a5bb99aaf8a8ccdd65954442a"
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})

    # fix dates
    start_date = pd.to_datetime(pd.Series([date]), infer_datetime_format=True)
    end_date = start_date.apply(six_days).apply(str)[0][0:10]
    start_date = start_date.apply(str).apply(remove_time)[0]
    
    # add the macro as two features - up down, and absolute change
    absolute_change, macro_direction = featurize_lidar(start_date, end_date)

    # add json
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    json = query_ticker(ticker, start_date, end_date)

    # create corpus
    corpus = aggregate_jsons(json)
    vectorized = corpus
    combinatric = lambda l: ''.join(l)
    corpus = combinatric(vectorized)

    # add sentiment
    mean = lambda listy: sum(listy) / len(listy)
    sentiment = (lambda orig: TextBlob(orig).sentiment.polarity)(corpus)
    sentiment_test = mean((lambda orig: [TextBlob(o).sentiment.polarity for o in orig])(vectorized))
    indico_sentiment = (lambda text: indicoio.sentiment_hq(text))(corpus)

    # add bag of words featurization
    badbag = bad_bag(vectorized)
    goodbag = good_bag(vectorized)

    # add last week's data
    start_last = pd.to_datetime(pd.Series([start_date])).apply(
        lambda start_date: start_date - datetime.timedelta(days=6))
    start_last = remove_time(str(start_last.values[0]))
    end_last = start_date
    lastweek = client.get_ticker_price(ticker, fmt='object', startDate=start_last, endDate=end_last, frequency='daily')
    lastweek = lastweek[0].close

    # get prices for return
    open_price, close_price = 0.0, 0.0
    if not dynamic:
        prices = client.get_ticker_price(ticker, fmt='object', startDate=start_date, endDate=end_date, frequency='daily')
        open_price = prices[0].open
        close_price = prices[-1].close

    df = pd.DataFrame(
        {
            "Start": pd.Series([start_date]),
            "End": pd.Series([end_date]),
            "sentiment": pd.Series([sentiment]),
            "sentiment_test": pd.Series([sentiment_test]),
            "indico_sentiment": pd.Series([indico_sentiment]),
            "bad_bag": pd.Series([badbag]),
            "good_bag": pd.Series([goodbag]),
            "lastweek": pd.Series([lastweek]),
            "headlines": pd.Series([vectorized]),
            "absolute_change": pd.Series([absolute_change]),
            "macro_direction": pd.Series([macro_direction])
        }
    )
    return df, open_price, close_price


def pipeline_linear(ticker, date, dyn):
    df, open_price, close_price = base_pipeline(ticker, date, dynamic=dyn)
    df["price"] = pd.Series([close_price])
    return df


def pipeline_logistic(ticker, date):
    df, open_price, close_price = base_pipeline(ticker, date)
    
    # add the delta - up or down
    tri_delt = close_price - open_price
    if tri_delt > 0:
        delta = 1
    else:
        delta = 0

    df["delta"] = pd.Series([delta])
    return df


def multi_row_pipeline(dates, ticker, pipeline_function=pipeline_logistic):
    rows = []
    for date in dates:
        row = pipeline_function(ticker, date)
        rows.append(row)
        print("finished {0} for {1}".format(date, ticker))
    df = pd.concat(rows)
    return df


def run_model_linear(df, model):
    X_test = df[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag", "lastweek", "absolute_change", "macro_direction"]]
    prediction = model.predict(X_test).tolist()[0]
    return prediction


# returns in the form PREDICTED, ACTUAL =====> two values need to be unpacked
def run_model_logistic(df, model_path):
    model = pickle_down(model_path)

    X_test = df[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag", "absolute_change", "macro_direction"]]
    Y_test = df["delta"]

    X_test = impute(X_test)
    scaler = preprocessing.StandardScaler().fit(X_test)
    X_test = scaler.transform(X_test)
    Y_predicted = model.predict(X_test)

    return Y_predicted.tolist(), Y_test.tolist()


def output_graph(ticker, start_date):
    docs, dates, prices = weekly_data_gather(ticker, start_date)
    sentiments = weekly_doc_aggregator(docs)
    df = fix_visualize(dates, prices, sentiments)
    perform_visualization(df, ticker)
    return


def fix_visualize(dates, prices, sentiments):
    df = pd.DataFrame({"dates": dates, "prices": prices, "sent": sentiments})
    df = df.replace(0, pd.np.nan).dropna(axis=0, how='any')
    return df


def perform_visualization(df, ticker):
    dates, sentiments, prices = df["dates"], df["sent"], df["prices"]
    fig = sns.lineplot(dates, sentiments, color='black')
    ax2 = plt.twinx()
    sns.lineplot(dates, prices, ax=ax2, color='green')
    fig.set_title("sentiment of {0} from {1} to {2}".format(ticker, dates[0], dates[len(dates)-1]))
    fig.set_xticklabels(dates)
    for item in fig.get_xticklabels():
        item.set_rotation(60)
    plt.savefig("web/testing.png")
    return fig


def weekly_data_gather(ticker, start_date):
    r, dates, prices = [], [], []
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    dt_obj = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=d1, endDate=d2)
    get_price = lambda t, s, e: client.get_ticker_price(ticker, fmt='object',
                                                        startDate=s, endDate=e, frequency='daily')
    for i in range(7):
        d1 = dt_obj + datetime.timedelta(days=i)
        d2 = dt_obj + datetime.timedelta(days=i + 1)
        d1, d2 = str(d1.strftime("%Y-%m-%d")), str(d2.strftime("%Y-%m-%d"))
        json = query_ticker(ticker, d1, d2)
        price = get_price(ticker, d1, d2)
        if not price:
            price = 0
        else:
            price = price[0].open
        docs = [j["description"] for j in json]
        r.append(docs)
        dates.append(d1)
        prices.append(price)
    return r, dates, prices


def weekly_doc_aggregator(documents_by_day):
    sentiments = []
    for day in documents_by_day:
        total_sentiment = 0
        for article in day:
            total_sentiment += TextBlob(article).sentiment.polarity
        sentiments.append(total_sentiment)
    return sentiments

In [40]:
macros = pd.read_csv("data/macros.csv")[["Date", "Macro"]]
macros.head()

Unnamed: 0,Date,Macro
0,2018-12-24,2.50563
1,2018-12-25,2.514005
2,2018-12-26,2.518193
3,2018-12-27,2.52238
4,2018-12-28,2.51988


In [41]:
two_forty = pd.read_csv("data/240_rows.csv")
two_forty.head()

Unnamed: 0.1,Unnamed: 0,Start,End,sentiment,sentiment_test,indico_sentiment,bad_bag,good_bag,lastweek,delta
0,0,2018-12-24,2018-12-30 00:00:00,0.115305,0.0958,0.459462,5e-06,2.3e-05,166.07,1
1,0,2018-12-31,2019-01-06 00:00:00,0.115305,0.0958,0.459462,5e-06,2.3e-05,157.17,0
2,0,2019-01-07,2019-01-13 00:00:00,0.115305,0.0958,0.459462,5e-06,2.3e-05,157.92,1
3,0,2019-01-14,2019-01-20 00:00:00,0.115305,0.0958,0.459462,5e-06,2.3e-05,150.75,1
4,0,2019-01-21,2019-01-27 00:00:00,0.115305,0.0958,0.459462,5e-06,2.3e-05,153.07,1


In [42]:
dates = pd.read_csv("data/dates.csv")
dates.columns = ["Start"]
dates = dates["Start"].tolist()
dates[0:3]

['2018-12-24', '2018-12-31', '2019-01-07']

# running data through logistic pipeline

In [43]:
tickers = ["aapl", "amzn", "fb", "crm", "msft", "baba", "wmt", "nflx", "tsla", "googl", "akam", "acn", 
           "adsk", "bac", "bby", "blk", "ebay", "hpe", "intc", "pypl", "snps", "tmus", "trip", "dis"]

In [45]:
master_df_set = []

In [46]:
for ticker in tickers:
    df = multi_row_pipeline(dates, ticker)
    master_df_set.append(df)
    print("************FINISHED WITH {0}************".format(ticker))

finished 2018-12-24 for aapl
finished 2018-12-31 for aapl
finished 2019-01-07 for aapl
finished 2019-01-14 for aapl
finished 2019-01-21 for aapl
finished 2019-01-28 for aapl
finished 2019-02-04 for aapl
finished 2019-02-11 for aapl
finished 2019-02-18 for aapl
finished 2019-02-25 for aapl
finished 2019-03-04 for aapl
finished 2019-03-11 for aapl
finished 2019-03-18 for aapl
finished 2019-03-25 for aapl
finished 2019-04-01 for aapl
finished 2019-04-08 for aapl
finished 2019-04-15 for aapl
finished 2019-04-22 for aapl
finished 2019-04-29 for aapl
finished 2019-05-06 for aapl
finished 2019-05-13 for aapl
finished 2019-05-20 for aapl
finished 2019-05-27 for aapl
finished 2019-06-03 for aapl
finished 2019-06-10 for aapl
finished 2019-06-17 for aapl
finished 2019-06-24 for aapl
************FINISHED WITH aapl************
finished 2018-12-24 for amzn
finished 2018-12-31 for amzn
finished 2019-01-07 for amzn
finished 2019-01-14 for amzn
finished 2019-01-21 for amzn
finished 2019-01-28 for amzn




finished 2019-01-14 for wmt
finished 2019-01-21 for wmt
finished 2019-01-28 for wmt
finished 2019-02-04 for wmt
finished 2019-02-11 for wmt
finished 2019-02-18 for wmt
finished 2019-02-25 for wmt
finished 2019-03-04 for wmt
finished 2019-03-11 for wmt
finished 2019-03-18 for wmt
finished 2019-03-25 for wmt
finished 2019-04-01 for wmt
finished 2019-04-08 for wmt
finished 2019-04-15 for wmt
finished 2019-04-22 for wmt
finished 2019-04-29 for wmt
finished 2019-05-06 for wmt
finished 2019-05-13 for wmt
finished 2019-05-20 for wmt
finished 2019-05-27 for wmt
finished 2019-06-03 for wmt
finished 2019-06-10 for wmt
finished 2019-06-17 for wmt
finished 2019-06-24 for wmt
************FINISHED WITH wmt************
finished 2018-12-24 for nflx
finished 2018-12-31 for nflx
finished 2019-01-07 for nflx
finished 2019-01-14 for nflx
finished 2019-01-21 for nflx
finished 2019-01-28 for nflx
finished 2019-02-04 for nflx
finished 2019-02-11 for nflx
finished 2019-02-18 for nflx
finished 2019-02-25 for n

finished 2019-01-28 for ebay
finished 2019-02-04 for ebay
finished 2019-02-11 for ebay
finished 2019-02-18 for ebay
finished 2019-02-25 for ebay
finished 2019-03-04 for ebay
finished 2019-03-11 for ebay
finished 2019-03-18 for ebay
finished 2019-03-25 for ebay
finished 2019-04-01 for ebay
finished 2019-04-08 for ebay
finished 2019-04-15 for ebay
finished 2019-04-22 for ebay
finished 2019-04-29 for ebay
finished 2019-05-06 for ebay
finished 2019-05-13 for ebay
finished 2019-05-20 for ebay
finished 2019-05-27 for ebay
finished 2019-06-03 for ebay
finished 2019-06-10 for ebay
finished 2019-06-17 for ebay
finished 2019-06-24 for ebay
************FINISHED WITH ebay************
finished 2018-12-24 for hpe
finished 2018-12-31 for hpe
finished 2019-01-07 for hpe
finished 2019-01-14 for hpe
finished 2019-01-21 for hpe
finished 2019-01-28 for hpe
finished 2019-02-04 for hpe
finished 2019-02-11 for hpe
finished 2019-02-18 for hpe
finished 2019-02-25 for hpe
finished 2019-03-04 for hpe
finished 20

In [47]:
training_mammoth_set = pd.concat(master_df_seter_df_setter_df_set)
training_mammoth_set.head()

Unnamed: 0,Start,End,sentiment,sentiment_test,indico_sentiment,bad_bag,good_bag,lastweek,headlines,absolute_change,macro_direction,delta
0,2018-12-24,2018-12-30,0.161692,0.113839,0.82943,4e-06,3.8e-05,166.07,[Pilgrimage to see Warren Buffett out of step ...,-0.00294,0,1
0,2018-12-31,2019-01-06,0.161692,0.113839,0.82943,4e-06,3.8e-05,157.17,[Pilgrimage to see Warren Buffett out of step ...,0.00844,1,0
0,2019-01-07,2019-01-13,0.161692,0.113839,0.82943,4e-06,3.8e-05,157.92,[Pilgrimage to see Warren Buffett out of step ...,-0.00107,0,1
0,2019-01-14,2019-01-20,0.161692,0.113839,0.82943,4e-06,3.8e-05,150.75,[Pilgrimage to see Warren Buffett out of step ...,0.00219,1,1
0,2019-01-21,2019-01-27,0.161692,0.113839,0.82943,4e-06,3.8e-05,153.07,[Pilgrimage to see Warren Buffett out of step ...,-0.0105,0,1


In [48]:
training_mammoth_set.to_csv("data/576_rows.csv")

# preparing to perform regressions

In [69]:
X = training_mammoth_set[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag", "absolute_change", "macro_direction"]]
Y = training_mammoth_set["delta"]

In [70]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# random forest [no hyperparameter tuning]

In [80]:
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
Y_predicted = rfc.predict(X_test)
error = accuracy_score(Y_test, Y_predicted) * 100
print("initial random forest was constructed with {0}% accuracy".format(error))

initial random forest was constructed with 68.51851851851852% accuracy




# random forest [with tuned hyperparameters]

In [85]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [92]:
rfc = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rfc, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [94]:
print(rf_random.best_params_)

{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': False}


In [95]:
# new and improved RandomForestClassifier
rfc_tuned = RandomForestClassifier(n_estimators=400,
                                  min_samples_split=10,
                                  min_samples_leaf=2,
                                  max_features="sqrt",
                                  max_depth=90,
                                  bootstrap=False)

In [96]:
rfc_tuned.fit(X_train, Y_train)
Y_predicted = rfc_tuned.predict(X_test)
error = accuracy_score(Y_test, Y_predicted) * 100
print("tuned random forest was constructed with {0}% accuracy".format(error))

tuned random forest was constructed with 70.98765432098766% accuracy


# logistic regression

In [82]:
logreg = LogisticRegression(solver="lbfgs")
logreg.fit(X_train, Y_train)
Y_predicted = logreg.predict(X_test)
error = accuracy_score(Y_test, Y_predicted) * 100
print("initial logistic regression was constructed with {0}% accuracy".format(error))

initial logistic regression was constructed with 66.0493827160494% accuracy


# gaussian naive bayes classifier

In [84]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_predicted = gnb.predict(X_test)
error = accuracy_score(Y_test, Y_predicted) * 100
print("initial gnb classifier was constructed with {0}% accuracy".format(error))

initial gnb classifier was constructed with 63.580246913580254% accuracy


# pickling the final random forest model

In [97]:
model = RandomForestClassifier(n_estimators=400,
                                  min_samples_split=10,
                                  min_samples_leaf=2,
                                  max_features="sqrt",
                                  max_depth=90,
                                  bootstrap=False)
model.fit(X, Y)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [99]:
import pickle
filename = 'data/logistic.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)