# Predicting Subscribers Using Random Forest Classification

Import Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections
import re, math
import warnings
import matplotlib.pyplot as plot

from sklearn import preprocessing
from patsy import dmatrices
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

## User Defined Functions for Preprocessing and Feature Engineering

Upload data file

In [None]:
def load_csv(csv_var_string):
    """Read a CSV source into a pandas DataFrame.

    ``csv_var_string`` is handed to ``pd.read_csv`` unchanged, so it may be
    anything that function accepts (a path or a file-like object).
    """
    frame = pd.read_csv(csv_var_string)
    return frame

Remove duplicates while preserving order (helper used by the synthesize_* functions below)

In [None]:
def remove_duplicates(values):
    """Return the items of ``values`` as a list with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved.  Items are compared with ``in`` on a list, so they do not
    have to be hashable.
    """
    unique = []
    for item in values:
        if item in unique:
            continue
        unique.append(item)
    return unique

Reorganize keyword feature phrases

In [None]:
def synthesize_keywords(df_col_arg):
    """Turn each keyword phrase into a list of unique, cleaned tokens.

    For every entry of ``df_col_arg``: stringify and lower-case it, delete
    the characters listed in ``punc``, split on whitespace, drop tokens
    found in ``stoplist`` and remove repeated tokens (first occurrence
    wins).  Returns one token list per input entry, in input order.
    """
    token_lists = []
    for raw in df_col_arg:
        text = str(raw).lower()
        text = "".join(ch for ch in text if ch not in punc)
        words = [word for word in text.split() if word not in stoplist]
        token_lists.append(remove_duplicates(words))
    return token_lists

Reorganize industry feature phrases, csv

In [None]:
def synthesize_industry(df_col_arg):
    """Turn each industry phrase into a list of unique, cleaned tokens.

    For every entry of ``df_col_arg``: stringify and lower-case it, replace
    every character that is not an ASCII letter with a space, delete the
    characters listed in ``punc``, split on whitespace, drop tokens found in
    ``stoplist`` and remove repeated tokens.  Returns one token list per
    input entry, in input order.
    """
    token_lists = []
    for raw in df_col_arg:
        text = re.sub("[^a-zA-Z]", " ", str(raw).lower())
        text = "".join(ch for ch in text if ch not in punc)
        words = [word for word in text.split() if word not in stoplist]
        token_lists.append(remove_duplicates(words))
    return token_lists

Reorganize seo feature phrases, csv

In [None]:
def synthesize_seo(df_col_arg):
    """Turn each SEO phrase into a list of unique, cleaned tokens.

    For every entry of ``df_col_arg``: stringify and lower-case it, apply
    the regex substitutions below in order, delete the characters listed in
    ``punc``, split on whitespace, drop tokens found in ``stoplist`` and
    remove repeated tokens.  Returns one token list per input entry.
    """
    # (pattern, replacement) pairs; the order matters and is applied as listed.
    # NOTE(review): the first pair turns every non-letter into a space, so the
    # later patterns that contain braces, '_' or \xce/\xbe can no longer match.
    substitutions = (
        ("[^a-zA-Z]", " "),
        ('null', ''),
        ('{{pagetitledescription}}', ''),
        ('homepage\xce\xbe', ''),
        ('world\xce\xbe', 'world'),
        ('worklife', 'worklife'),
        ('wwwwepowcom', ''),
        ('s_', ''),
        ('metadescription', ''),
        ('\xce\xbelemonlight', ''),
        ('\xce', ''),
        ('\xbe', ''),
    )
    token_lists = []
    for raw in df_col_arg:
        text = str(raw).lower()
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        text = "".join(ch for ch in text if ch not in punc)
        words = [word for word in text.split() if word not in stoplist]
        token_lists.append(remove_duplicates(words))
    return token_lists

Reorganize technologies feature phrases, csv

In [None]:
def synthesize_technologies(df_col_arg):
    """Turn each technologies phrase into a list of unique, cleaned tokens.

    For every entry of ``df_col_arg``: stringify and lower-case it, replace
    every character that is not an ASCII letter with a space, remove the
    text ``max-width``, delete the characters listed in ``punc``, split on
    whitespace, drop tokens found in ``stoplist`` and remove repeated
    tokens.  Returns one token list per input entry.
    """
    token_lists = []
    for raw in df_col_arg:
        text = re.sub("[^a-zA-Z]", " ", str(raw).lower())
        # NOTE(review): the hyphen was already replaced by a space on the line
        # above, so this pattern cannot match at this point.
        text = re.sub('max-width', '', text)
        text = "".join(ch for ch in text if ch not in punc)
        words = [word for word in text.split() if word not in stoplist]
        token_lists.append(remove_duplicates(words))
    return token_lists

Reorganize languages feature phrases, csv

In [None]:
def synthesize_languages(df_col_arg):
    """Turn each languages phrase into a list of unique, cleaned tokens.

    For every entry of ``df_col_arg``: stringify and lower-case it, delete
    the characters listed in ``punc``, split on whitespace, drop tokens
    found in ``stoplist`` and remove repeated tokens.  Returns one token
    list per input entry, in input order.
    """
    token_lists = []
    for raw in df_col_arg:
        text = "".join(ch for ch in str(raw).lower() if ch not in punc)
        words = [word for word in text.split() if word not in stoplist]
        token_lists.append(remove_duplicates(words))
    return token_lists

Reorganize title feature phrases, csv

In [None]:
def synthesize_title(df_col_arg):
    """Turn each job title into a list of unique, normalised word tokens.

    For every entry of ``df_col_arg`` the value is stringified and
    lower-cased, '/' becomes a space, 'co-' becomes 'co', and well-known
    long-form titles are collapsed to their abbreviation
    ('vice president' -> 'vp', 'chief executive officer' -> 'ceo', ...).
    Any remaining 'chief' is removed.  The text is then split on whitespace
    and repeated words are dropped (first occurrence wins).

    An entry whose whole text is ``MISSING`` (compared case-insensitively)
    yields ``["0"]``.

    Fixes over the previous version:
      * the title is split into words before de-duplication; previously the
        raw string was de-duplicated, which produced unique *characters*;
      * the ``MISSING`` sentinel is compared by value on the raw text; it was
        previously an identity test on already lower-cased data and so
        could never match;
      * the sentinel branch no longer references an undefined name.

    Returns a list with one token list per input entry, in input order.
    """
    # Applied in order.  Every pattern is a plain literal, so str.replace
    # behaves exactly like the regex substitutions it replaces.
    replacements = (
        ('/', ' '),
        ('co-', 'co'),
        ('vice president', 'vp'),
        ('chief innovation officer', 'cio'),
        ('chief revenue officer', 'cro'),
        ('chief executive officer', 'ceo'),
        ('chief technology officer', 'cto'),
        ('chief creative officer', 'cco'),
        ('chief operating officer', 'coo'),
        ('chief marketing officer', 'cmo'),
        ('chief digital officer', 'cdo'),
        ('chief digital transformation officer', 'cdo'),
        ('chief growth officer', 'cgo'),
        ('chief customer officer', 'ccto'),
        ('chief product officer', 'cpo'),
        ('chief problem solver', 'cps'),
        ('chief people person', ''),
        ('chief', ''),
    )
    converted_linear_title = []
    for title in df_col_arg:
        text = str(title).strip()
        if text.upper() == "MISSING":
            converted_linear_title.append(["0"])
            continue
        text = text.lower()
        for old, new in replacements:
            text = text.replace(old, new)
        # Order-preserving de-duplication of the word tokens.
        seen = set()
        tokens = []
        for word in text.split():
            if word not in seen:
                seen.add(word)
                tokens.append(word)
        converted_linear_title.append(tokens)
    return converted_linear_title

Function to identify subscribers on training data

In [None]:
def identify_conversions_via_indices(df_arg_train):
    """Return the index labels of the rows whose 'converted' value equals 1.

    Labels are returned as a plain list in row order.  Every row is
    inspected, including the last one: the previous ``range(first, last)``
    loop stopped one row early.  The index does not have to be a contiguous
    run of integers, and an empty frame yields ``[]``.

    Raises KeyError if ``df_arg_train`` has no 'converted' column.
    """
    is_converted = (df_arg_train['converted'] == 1).values
    return df_arg_train.index[is_converted].tolist()

Translate the continuous cosine-similarity result into an integer bin in the range 0 to 20

In [None]:
def binning(input_var):
    """Map a similarity score onto an integer bin from 0 to 20.

    Values less than or equal to 0 fall in bin 0.  Positive values fall in
    the first bin whose upper bound (0.05, 0.10, ..., 0.95, inclusive) is
    not exceeded, numbered from 1.  Anything else, i.e. values above 0.95
    and values that compare false against every bound, falls in bin 20.
    """
    if input_var <= 0:
        return 0
    # Inclusive upper bounds of bins 1..19, written as the same literals the
    # comparison ladder used so boundary values land in the same bin.
    upper_bounds = (
        .05, .10, .15, .20, .25, .30, .35, .40, .45, .50,
        .55, .60, .65, .70, .75, .80, .85, .90, .95,
    )
    for bin_number, upper in enumerate(upper_bounds, 1):
        if input_var <= upper:
            return bin_number
    return 20

Transform the binning results into categorical variables for model input

In [None]:
def transform_into_categorical_vals(list_arg):
    """Apply ``binning`` to every score in ``list_arg``.

    Returns a new list with one bin number per input score, in input order.
    """
    return [binning(score) for score in list_arg]

Fill voids in the original dataset to avoid processing errors

In [None]:
def fill_voids(df_arg):
    """Replace missing values in the known columns of ``df_arg`` in place.

    The columns 'id', 'funding', 'amount' and 'converted' are filled with
    the integer 0; every other listed column is filled with the string '0'.
    The frame is modified in place and nothing is returned.

    Each filled column is assigned back to the frame instead of calling
    ``fillna(..., inplace=True)`` on a column selection.  That chained form
    acts on an intermediate object and is not guaranteed to update
    ``df_arg`` (it does not under pandas Copy-on-Write).

    Raises KeyError if any of the listed columns is absent.
    """
    fill_values = (
        ('id', 0),
        ('first', '0'),
        ('last', '0'),
        ('title', '0'),
        ('company', '0'),
        ('size', '0'),
        ('industry', '0'),
        ('keywords', '0'),
        ('city', '0'),
        ('state', '0'),
        ('technologies', '0'),
        ('languages', '0'),
        ('funding', 0),
        ('stage', '0'),
        ('amount', 0),
        ('latest', '0'),
        ('seo', '0'),
        ('converted', 0),
    )
    for column, filler in fill_values:
        df_arg[column] = df_arg[column].fillna(filler)

Calculate cosine similarity

In [None]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two token-count mappings.

    ``vec1`` and ``vec2`` map tokens to numeric weights.  The result is the
    dot product over the shared tokens divided by the product of the two
    Euclidean norms, or 0.0 when that product is zero.
    """
    shared_tokens = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum([vec1[token] * vec2[token] for token in shared_tokens])

    squares1 = sum([vec1[token]**2 for token in vec1.keys()])
    squares2 = sum([vec2[token]**2 for token in vec2.keys()])
    norm_product = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard against dividing by zero when either vector is empty or all-zero.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product

def text_to_vector(text):
    """Count how often each token matched by ``WORD`` occurs in ``text``.

    Returns a ``Counter`` keyed by token.
    """
    return Counter(WORD.findall(text))

def calculate_cosine_vectors(list1_arg, idx_arg, list2_arg):
    """Score every entry of ``list1_arg`` against a set of reference entries.

    For each index label of ``list1_arg`` (in order), the entry is
    stringified and vectorised with ``text_to_vector``.  It is compared via
    ``get_cosine`` with every entry of ``list2_arg`` selected by the labels
    in ``idx_arg``, and the highest similarity is kept.

    Returns a list with one maximum similarity per entry of ``list1_arg``.
    When ``idx_arg`` is empty there is nothing to compare against, so every
    entry scores 0.0 instead of raising ValueError from ``max`` on an empty
    list.

    Raises KeyError if a label in ``idx_arg`` is not present in
    ``list2_arg``.
    """
    # The reference vectors do not depend on the row being scored, so they
    # are built once up front rather than once per row.
    reference_vectors = [
        text_to_vector(str(list2_arg.loc[ref_label])) for ref_label in idx_arg
    ]

    convert_transform = []
    # Walk the actual index labels so a non-contiguous or unsorted index is
    # handled; a contiguous integer index gives the same rows as before.
    for label in list1_arg.index:
        row_vector = text_to_vector(str(list1_arg.loc[label]))
        similarities = [get_cosine(row_vector, ref) for ref in reference_vectors]
        convert_transform.append(max(similarities) if similarities else 0.0)
    return convert_transform

Header and categorical values

In [18]:
# Columns that are tokenised and later replaced by binned similarity scores.
feature_vector = [
    'title',
    'keywords',
    'industry',
    'seo',
    'technologies',
    'languages',
]

# Columns that are one-hot encoded before being passed to the model.
categorical_variables = [
    'title',
    'company',
    'size',
    'industry',
    'keywords',
    'city',
    'state',
    'technologies',
    'languages',
    'funding',
    'stage',
    'latest',
    'seo',
]

Punctuation characters, stop words and the word pattern used to reduce feature phrases to vectors

In [None]:
# Characters deleted from every phrase by the synthesize_* helpers.
punc = ['\'','!', '.', ':', ',', '\\', '&', '|', '+', '-']
# English stop words from NLTK; tokens found in this list are discarded.
stoplist = stopwords.words('english')
# Tokeniser used by text_to_vector: each run of word characters is one token.
WORD = re.compile(r'\w+')

Upload csv file

In [3]:
# Path of the CSV file that is passed to load_csv further down.
csv_merge_input = 'validate.csv'

Identify the start and end of the training data and the start and end of the test data:  
train_start is always 0,
train_end = #_of_training_entries - 1,
test_start is always train_end + 1,
test_end = #_of_all_entries - 1

In [None]:
# Inclusive row-label bounds of the training data.
train_start = 0
train_end = 8999

# Inclusive row-label bounds of the test data, which begins right after the
# training rows.
test_start = train_end + 1
test_end = 12289

Load the CSV, fill the voids, then synthesize each feature column into token lists and sort the tokens within each entry

In [4]:
# Load the raw data and replace missing values so str() and split() are safe.
df = load_csv(csv_merge_input)
fill_voids(df)

# Tokenise every feature column, then sort each token list in place so that
# equal token sets get the same ordering.
# NOTE(review): every column goes through synthesize_keywords; the
# column-specific synthesize_* helpers are not called here. Confirm intended.
for f_v in feature_vector:
    token_lists = synthesize_keywords(df[f_v])
    for tokens in token_lists:
        tokens.sort()
    df[f_v] = token_lists

Identify the subscribers in the training set. Calculate the cosine similarities, then bin the results so that each feature becomes a categorical variable

In [6]:
# Index labels of the training rows that identify_conversions_via_indices
# reports as converted.
CCTI = identify_conversions_via_indices(df.loc[train_start:train_end])
# Replace each feature column by the binned maximum cosine similarity between
# the row and the converted training rows.
# NOTE(review): converted training rows are also compared with themselves, so
# they presumably score 1.0 (top bin) whenever non-empty; confirm that this
# label leakage into the training features is acceptable.
for f_v in feature_vector:
    df[f_v] = transform_into_categorical_vals(calculate_cosine_vectors(df[f_v], CCTI, df[f_v].loc[train_start:train_end]))

Remove the columns which will not be used as input to the model

In [8]:
# Identifier columns carry no signal for the model; drop them in one call.
df.drop(['last', 'first', 'id'], axis=1, inplace=True)

Pop out the subscriber results from the dataset and use it as an input to the fit model below

In [None]:
# Remove the 'converted' column from df and keep it as the target vector.
y = df.pop('converted')

Translate the feature values into categorical variables by way of 'get_dummies'

In [9]:
# One-hot encode each categorical column: take the column out of df, expand
# it into '<variable>_<value>' indicator columns and append those to df.
for variable in categorical_variables:
    dummies = pd.get_dummies(df.pop(variable), prefix=variable)
    df = pd.concat([df, dummies], axis=1)

Invoke the random forest classifier

In [10]:
# Forest of 500 trees with out-of-bag scoring enabled; n_jobs=-1 requests
# all available cores.
# NOTE(review): no random_state is passed, so results presumably differ
# between runs; confirm whether reproducibility is needed.
model = RandomForestClassifier(n_estimators=500,oob_score=True, n_jobs=-1)

Fit the data to the model (rfc)

In [11]:
# df.loc[...] is label-based and includes train_end, while the plain slice on
# y excludes its end, hence the + 1. Both select the same rows assuming the
# default 0..n-1 index (TODO confirm).
model.fit(df.loc[train_start:train_end], y[train_start:train_end+1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

Generate the solution through 'predict'

In [13]:
# Predict the class for the test rows; test_start and test_end are both
# included by the label-based .loc slice.
solution = model.predict(df.loc[test_start:test_end])
print(solution)

[ 0.  0.  0. ...,  0.  0.  0.]


Show the feature importance

In [15]:
# Bare expression: the notebook displays the fitted model's importances array
# as the cell output.
model.feature_importances_

array([  1.12690456e-02,   2.28415487e-06,   9.88153637e-09, ...,
         3.51849286e-08,   1.26953750e-05,   6.36639356e-02])

Re-aggregate the categorical variables into feature variables and aggregate their impact

In [16]:
def graph_feature_importance(model, feature_names, autoscale=True, headroom=.09, width=10, summarized_columns=None):
    """Draw a horizontal bar chart of the model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : names paired positionally with the importances.
    autoscale : when true, the x axis ends at the largest plotted value plus
        ``2 * headroom``; otherwise it ends at 1.
    headroom : extra space added to the x axis when ``autoscale`` is true.
    width : figure width; the height is ``len(results) / 4``.
    summarized_columns : optional column names whose one-hot columns
        (named ``<column>_<value>``, or exactly ``<column>``) are summed
        into a single bar labelled ``<column>``.

    Changes over the previous version:
      * ``dict.items()`` replaces the Python-2-only ``iteritems()``;
      * ``Series.sort_values()`` replaces the removed ``Series.sort()``;
      * one-hot columns are matched by the ``<column>_`` prefix rather than
        by substring, so a dummy whose value merely contains another
        column's name is no longer counted under that column;
      * the x-axis limit is taken from the plotted (possibly aggregated)
        values, so summed bars are not clipped.

    Nothing is returned; the chart is drawn on the current matplotlib state.
    """
    feature_dict = dict(zip(feature_names, model.feature_importances_))

    if summarized_columns:
        for col_name in summarized_columns:
            prefix = col_name + "_"
            matching = [
                name for name in feature_dict
                if name == col_name or str(name).startswith(prefix)
            ]
            # A column with no matching features still gets a bar of height 0.
            feature_dict[col_name] = sum(feature_dict.pop(name) for name in matching)

    results = pd.Series(list(feature_dict.values()), index=list(feature_dict.keys()))
    results = results.sort_values()

    if autoscale:
        x_scale = results.max() + 2 * headroom
    else:
        x_scale = 1

    results.plot(kind="barh", figsize=(width, len(results)/4), xlim=(0, x_scale), fontsize=20)

Plot the results from the aggregated feature importances

In [17]:
# Chart the importances with the one-hot columns of each categorical variable
# collapsed into a single bar, then display the figure.
graph_feature_importance(model, df.columns, summarized_columns=categorical_variables);
plot.show()