# Training a model

**The purpose of this notebook is to load a training set for a campaign section from its PostgreSQL database, train a model, and store the model for future use.** 

In [1]:
# Load required packages
import os

import numpy as np
import pandas as pd
import psycopg2
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split, learning_curve, StratifiedShuffleSplit, GridSearchCV,
    cross_val_score
)
from sklearn.preprocessing import LabelEncoder, StandardScaler

try:
    # Prefer the standalone package: sklearn.externals.joblib was deprecated
    # in scikit-learn 0.21 and removed in 0.23
    import joblib
except ImportError:
    from sklearn.externals import joblib

import feature_engineering

Let's begin by querying the training set from PostgreSQL.

In [3]:
# Set database connection settings
db_name = 'section1'
usernm = 'redwan'
host = 'localhost'
port = '5432'
# Read the password from the environment rather than hardcoding it in the
# notebook. When PGPASSWORD is unset this is None, which psycopg2 omits from
# the connection string so libpq can fall back to its own mechanisms
# (e.g. ~/.pgpass or trust authentication).
pwd = os.environ.get('PGPASSWORD')

# Prepare a connection to a database for a campaign section
con = psycopg2.connect(
    database=db_name,
    host=host,
    port=port,
    user=usernm,
    password=pwd
)

# Query all data from a campaign section. The table shares the database's
# name; it is interpolated from the constant above, not from user input.
try:
    sql_query = 'SELECT * FROM {}'.format(db_name)
    section_df_full = pd.read_sql_query(sql_query, con)
finally:
    # Release the connection even if the query fails
    con.close()

Next, let's build the design matrix.

In [4]:
# Names of the columns used as model inputs. The order of this list is the
# order in which the columns are selected from the full table.
features = [
    'num_sents',
    'num_words',
    'num_all_caps',
    'percent_all_caps',
    'num_exclms',
    'percent_exclms',
    'num_apple_words',
    'percent_apple_words',
    'avg_words_per_sent',
    'num_paragraphs',
    'avg_sents_per_paragraph',
    'avg_words_per_paragraph',
    'num_images',
    'num_videos',
    'num_youtubes',
    'num_gifs',
    'num_hyperlinks',
    'num_bolded',
    'percent_bolded',
]

# Build the design matrix: every row, restricted to the feature columns
X = section_df_full.loc[:, features]

Let's deal with missing data.

In [5]:
# Drop rows in which every feature is missing, then replace the missing
# values that remain in the surviving rows with zero
X_cleaned = X.dropna(how='all').fillna(0)

We also need to standardize the features.

In [6]:
# Fit the scaler on the cleaned design matrix; the fitted object is kept so
# the same transformation can be reapplied later
scaler = StandardScaler().fit(X_cleaned)

# Apply the fitted scaling to obtain the standardized features
X_std = scaler.transform(X_cleaned)

Next, let's collect the entries for the target variable that correspond to those in the design matrix, and store them in a separate table.

In [7]:
# Keep only the target rows whose index labels survived the cleaning of the
# design matrix; the list indexer yields a one-column DataFrame
y = section_df_full.loc[X_cleaned.index, ['funded']]

Let's encode the target variable, whose contents are Booleans, as a numeric variable.

In [8]:
# Flatten the single-column target table into a 1-D array of class labels
target_labels = np.ravel(y.values)

# Learn the label-to-integer mapping, then encode the labels with it
le = LabelEncoder()
le.fit(target_labels)
y_enc = le.transform(target_labels)

We'll use grid search and cross-validation to determine the optimal hyperparameters for the desired model.

In [9]:
# Candidate values for the hyperparameter C: 10 points spaced evenly on a
# log scale between 1e-3 and 1e3
c_candidates = np.logspace(start=-3, stop=3, num=10)
param_grid = [{'C': c_candidates}]

# Cross-validation strategy: 10 stratified shuffle splits, each holding out
# 20% of the samples, with a fixed seed so the splits are reproducible
cv_splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=41)

# Model whose hyperparameters are being tuned
base_estimator = LogisticRegression()

# Grid search that scores every candidate by precision, using all processors
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='precision',
    cv=cv_splitter,
    n_jobs=-1
)

# Run the search on the standardized features and encoded labels
grid_search.fit(X_std, y_enc)

# Show the best-scoring hyperparameters as the cell output
grid_search.best_params_

{'C': 0.0046415888336127772}

Let's use the optimal hyperparameters identified by the grid search and train a final model on the complete training set. This model is ready to be deployed!

In [10]:
# Take the value of C that the grid search selected
best_c = grid_search.best_params_['C']

# Fit a fresh classifier with that value on the entire dataset
# (fit returns the estimator itself, so the assignment produces no cell output)
final_clf = LogisticRegression(C=best_c).fit(X_std, y_enc)

Finally, let's save the classifier along with the scaler object used to standardize the features.

In [11]:
# Output files, written relative to the current working directory
classifier_path = 'trained_classifier.pkl'
scaler_path = 'trained_scaler.pkl'

# Persist the trained classifier together with the fitted scaler
joblib.dump(final_clf, classifier_path)
joblib.dump(scaler, scaler_path)