In [2]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from collections import Counter
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (GradientBoostingRegressor, 
                              RandomForestRegressor,
                              AdaBoostRegressor)
from sklearn.svm import SVR, SVC
import sklearn.model_selection as cv
# from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import mean_squared_error, log_loss, r2_score, mean_absolute_error
from sklearn.metrics import *
from collections import defaultdict
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [9]:
!pwd

/Users/raffi/Desktop/galvanize/predicting_engagement/notebooks


In [10]:
clean_df = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/clean_df.csv')
mna_df = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/mn_aesthetic.csv')
mnt_df = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/mn_technical.csv')
x_predict = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/x_predict.csv')
vgg19_predict = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/vgg19_predict.csv')
x_features = pd.read_csv('~/Desktop/galvanize/predicting_engagement_v1/notebooks/csv/x_features.csv')

clean_df.shape, mna_df.shape, mnt_df.shape, x_predict.shape, vgg19_predict.shape, x_features.shape

((5637, 42), (5636, 5), (5636, 5), (5636, 3), (5636, 3), (5636, 4))

In [12]:
save_for_later = ['edge_media_to_parent_comment.count', 'edge_media_preview_comment.count',
                 'edge_media_preview_like.count', 'parent_comments_text', 'preview_comments_text',
                 'comments_text', 'bag_of_words', 'like_ratio', 'comment_ratio', 'mean_norm_like_ratio', 
                  'minmax_norm_like_ratio', 'mean_norm_comment_ratio', 'minmax_norm_comment_ratio',
                 'image_may_contain', 'username_free_caption', 'comment_hashtags', 'bow',
                 'image_label_and_caption', 'engagement_label']
keep_columns = ['owner.username', 'shortcode', 'datetime', '__typename','owner.edge_owner_to_timeline_media.count', 'owner.edge_followed_by.count',
                'tagged_users', 'caption', 'community_louvain', 'community_raffi', 'like_ratio', 'comment_ratio',
                'year', 'month', 'day_of_week', 'hour', 'caption_hashtags', 'edge_media_to_parent_comment.count', 
               'edge_media_preview_like.count', 'engagement_ratio', 'engagement_label']
df = clean_df[keep_columns].copy()
df = pd.get_dummies(df, columns = ['__typename'], prefix = ['type'], drop_first= True)
df.rename(columns = {'owner.edge_owner_to_timeline_media.count': 'posts', 'owner.edge_followed_by.count': 'followed_by',
                    'edge_media_to_parent_comment.count': 'comment_count', 'edge_media_preview_like.count': 'like_count',
                    'owner.username': 'username', 'community_louvain': 'louvain', 'community_raffi': 'raffi'}, inplace = True)
mna_df.rename(columns = {'shortcodes': 'shortcode'}, inplace = True)
mnt_df.rename(columns = {'shortcodes': 'shortcode'}, inplace = True)
df = pd.get_dummies(df, columns = ['louvain'], prefix = ['l_'], drop_first= True)
df = pd.get_dummies(df, columns = ['raffi'], prefix = ['r_'], drop_first= True)
df.fillna('', inplace = True)

#new columns
df['date'] = df['datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df['number_of_tagged_users'] = df.tagged_users.str.split().apply(len)
df['number_of_caption_words'] = df.caption.str.split().apply(len)
df['number_of_caption_hashtags'] = df.caption_hashtags.str.split().apply(len)
df.sort_values(by = 'date', ascending = False, inplace = True)
df['days_since_last_post'] = df.groupby('username')['date'].diff().apply(lambda x: -x.days)
df = pd.merge(df, mna_df, how = 'left', on = 'shortcode')
df = pd.merge(df, mnt_df, how = 'left', on = 'shortcode')
df = pd.merge(df, x_predict, how = 'left', on = 'shortcode')
df = pd.merge(df, x_features, how = 'left', on = 'shortcode')
df['predictions'] = df['predictions'].apply(lambda x: str(x)[2:-2])
df['x_predict'] = pd.to_numeric(df['predictions'])
df = pd.merge(df, vgg19_predict, how = 'left', on = 'shortcode')
df['predictions_y'] = df['predictions_y'].apply(lambda x: str(x)[2:-2])
df['vgg19_predict'] = pd.to_numeric(df['predictions_y'])
df['new_engagement'] = df['like_ratio'] + df['comment_ratio']


df.fillna(0, inplace = True)

keep_external = ['datetime', 'posts', 'followed_by','year',
       'month', 'day_of_week', 'hour','comment_count',
       'like_count', 'engagement_ratio',
       'type_GraphSidecar', 'type_GraphVideo', 'l__1', 'l__2', 'l__3', 'l__4',
       'l__5', 'l__6', 'l__7',
       'number_of_tagged_users', 'number_of_caption_words',
       'number_of_caption_hashtags', 'days_since_last_post',
       'aesthetic_x', 'std_aesthetic_x',
       'aesthetic_y', 'std_aesthetic_y',
       'x_predict', 'vgg19_predict']

external_df = df[keep_external].copy()
external_df.rename(columns = {'number_of_tagged_users': 'tagged_users', 'number_of_caption_words': 'caption_words', 'number_of_caption_hashtags': 'caption_hashtags', 'mean_aesthetic_x': 'aesthetic', 'std_aesthetic_x': 'std_aesthetic', 'mean_aesthetic_y': 'technical', 'std_aesthetic_y': 'std_technical'}, inplace = True)

external_df.to_csv('~/Desktop/galvanize/capstone_3/external_df.csv')

KeyError: "['std_aesthetic_x', 'std_aesthetic_y', 'aesthetic_y', 'aesthetic_x'] not in index"

In [8]:
df = pd.read_csv('~/Desktop/galvanize/capstone_3/new_df.csv')

In [5]:
df_numeric = external_df.select_dtypes(include = np.number)

In [6]:
df_numeric.columns

Index(['posts', 'followed_by', 'year', 'month', 'day_of_week', 'hour',
       'comment_count', 'like_count', 'engagement_ratio', 'type_GraphSidecar',
       'type_GraphVideo', 'l__1', 'l__2', 'l__3', 'l__4', 'l__5', 'l__6',
       'l__7', 'tagged_users', 'caption_words', 'caption_hashtags',
       'days_since_last_post', 'aesthetic', 'std_aesthetic', 'technical',
       'std_technical', 'x_predict', 'vgg19_predict'],
      dtype='object')

In [7]:
df_numeric.columns.drop('like_count', 'comment_count')

Index(['posts', 'followed_by', 'year', 'month', 'day_of_week', 'hour',
       'comment_count', 'engagement_ratio', 'type_GraphSidecar',
       'type_GraphVideo', 'l__1', 'l__2', 'l__3', 'l__4', 'l__5', 'l__6',
       'l__7', 'tagged_users', 'caption_words', 'caption_hashtags',
       'days_since_last_post', 'aesthetic', 'std_aesthetic', 'technical',
       'std_technical', 'x_predict', 'vgg19_predict'],
      dtype='object')

In [13]:
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators = 2000, n_jobs = -1)
knn = KNeighborsRegressor()

models = [lr, rf, knn]
model_names = ['lr', 'rf', 'knn']

X_columns = ['posts', 'followed_by', 'year', 'month', 'day_of_week', 'hour','type_GraphSidecar',
             'type_GraphVideo', 'l__1', 'l__2', 'l__3', 'l__4', 'l__5', 'l__6', 'l__7',
             'tagged_users', 'caption_words', 'caption_hashtags','days_since_last_post', 
             'aesthetic', 'std_aesthetic', 'technical','std_technical', 'x_predict', 'vgg19_predict']

def get_train_test_split(df):
    """Get numerical columns from df, return X and y"""

    X = df_numeric[X_columns]
    Y = df_numeric['engagement_ratio']
    X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
    return X_train, X_test, y_train, y_test

def run_model(models, X_train, X_test, y_train, y_test):
    for model, model_name in zip(models, model_names):
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)
        mse = mean_squared_error(y_test, yhat)
        mae = mean_absolute_error(y_test, yhat)
        r2 = r2_score(y_test, yhat)
        rmse = np.sqrt(mse)
        print(f'{model_name} r2: {r2}, mae: {mae}, mse: {mse}, rmse: {rmse}')
        
    A = df_numeric[X_columns]
    b = df_numeric['engagement_ratio']
    A = np.column_stack([np.ones(A.shape[0]), A])
    
    X_train, X_test, y_train, y_test = train_test_split(A, b, test_size=0.20, random_state=42)
    # calculate the economy SVD for the data matrix A
    U,S,Vt = np.linalg.svd(X_train, full_matrices=False)
    # solve Ax = b for the best possible approximate solution in terms of least squares
    x_hat = Vt.T @ np.linalg.inv(np.diag(S)) @ U.T @ y_train
    # perform train and test inference
    y_pred = X_train @ x_hat
    yhat = X_test @ x_hat
    mse = mean_squared_error(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    r2 = r2_score(y_test, yhat)
    rmse = np.sqrt(mse)
    print(f'svd r2: {r2}, mae: {mae}, mse: {mse}, rmse: {rmse}')    

In [14]:
X_train, X_test, y_train, y_test = get_train_test_split(df)
run_model(models, X_train, X_test, y_train, y_test)

lr r2: 0.1035265093193779, mae: 1.2125862476632085, mse: 3.1834786617177406, rmse: 1.7842305517274781
rf r2: 0.4350189648379781, mae: 0.9702803588759809, mse: 2.006311495444174, rmse: 1.4164432552856376
knn r2: 0.29667926016388624, mae: 1.0333765000231783, mse: 2.49757141832705, rmse: 1.5803706585250974
svd r2: 0.10352650998370971, mae: 1.2125862466816244, mse: 3.1834786593586237, rmse: 1.784230551066376


In [1]:
corr = df_numeric.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.7, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

NameError: name 'df_numeric' is not defined