# Import Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option("max_columns", None)
from scipy.stats import chi2_contingency
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation, Dropout
from keras.optimizers import Adadelta
from keras.layers.advanced_activations import LeakyReLU
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


# Import Data

In [2]:
data = pd.read_csv('fashion_data_on_Instagram.csv')
data.set_index('UserId', inplace = True)
data.head()

Unnamed: 0_level_0,Followings,Followers,MediaCount,BrandName,BrandCategory,Hashtags,Caption,ImgURL,Likes,Comments,CreationTime,Link,Selfie,BodySnap,Marketing,ProductOnly,NonFashion,Face,Logo,BrandLogo,Smile,Outdoor,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1171579752855683619_212070047,518.0,9840.0,541,abercrombie,High street,"beautiful, summer, fashion, love, cute, food, ...","We were born to be REAL, not to be perfect. ...",https://scontent.cdninstagram.com/t51.2885-15/...,97,0,1453883211,https://www.instagram.com/p/BBCSZPzAMIj/,0.000885,0.336438,0.74122,0.009454,0.001827,0.839466,0.890231,0.292843,0.062638,0.012164,0.931486,3.42211,0.083862,0.015089,0.000583,2.6e-05,1.7e-05,0.890586,0.009657,0.000181
1171594777274371222_176762322,7333.0,2300.0,272,abercrombie,High street,"teen, model, brunette, selfie, hollister, snap...","Gotta run, but first, let me take selfie. Me...",https://scontent.cdninstagram.com/t51.2885-15/...,94,0,1453885002,https://www.instagram.com/p/BBCVz4YJEyW/,0.003912,0.995503,0.001728,0.002125,0.002438,0.480598,0.550629,0.063139,0.126848,0.00433,0.466329,2.91971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,91,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,94,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171508187966229230_2797323089,132.0,456.0,56,abercrombie,High street,"abercrombieandfitch, shopping, love, hollister...",#love #shopping #shoppen #hollister #abercro...,https://scontent.cdninstagram.com/t51.2885-15/...,9,1,1453874680,https://www.instagram.com/p/BBCCH1zMEru/,0.002021,0.014188,0.028351,0.418125,0.601519,0.01084,0.994191,0.332847,0.007911,0.002386,0.393063,1.45586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Clean Data

### Let's See if there are any NaNs in the data

In [3]:
set(data['Hashtags'].isna())

{False, True}

In [4]:
set(data['Caption'].isna())

{False, True}

In [5]:
set(data['Comments '].isna())

{False}

##### So there exist NaN values in Hashtags and Captions. There are no other columns that contain NaN.

In [6]:
data.isna().groupby('Hashtags')['Followings'].count()

Hashtags
False    24606
True       146
Name: Followings, dtype: int64

##### There are 146 NaN rows in hashtags. It is possible that this is simply due to the fact that some posts do not have hashtags.

In [7]:
data.isna().groupby('Caption')['Followings'].count()

Caption
False    24717
True        35
Name: Followings, dtype: int64

##### There are 35 NaN rows in Caption

### Maybe the caption or hashtag information is missing because there is either only a caption or only hashtags....

In [8]:
hashtag_nan = list(data['Hashtags'].isna())
caption_nan = list(data['Caption'].isna())

In [9]:
count = 0
for i in range(len(hashtag_nan)):
    if hashtag_nan[i] and caption_nan[i]:
        count += 1
print('The number of times both hashtag_nan and caption_nan is true is', count)

The number of times both hashtag_nan and caption_nan is true is 18


##### So we have missing hashtag both 18/35 times that we have a missing caption. So it is not exactly correlated. Maybe we can just drop the rows that have missing caption or hashtag if we need to use these, or claim that there was not hashtag or caption for these rows.

### I want to try and see if I can make brand category numeric

In [3]:
brand_category = pd.get_dummies(data['BrandCategory'])

In [4]:
brand_category.head()

Unnamed: 0_level_0,Designer,High street,Mega couture,Small couture
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1171579752855683619_212070047,0,1,0,0
1171594777274371222_176762322,0,1,0,0
1171407552643586413_581125501,0,1,0,0
1171407552643586413_581125501,0,1,0,0
1171508187966229230_2797323089,0,1,0,0


In [5]:
data_new = pd.concat([data, brand_category], sort = False, axis = 1)
data_new.head()

Unnamed: 0_level_0,Followings,Followers,MediaCount,BrandName,BrandCategory,Hashtags,Caption,ImgURL,Likes,Comments,CreationTime,Link,Selfie,BodySnap,Marketing,ProductOnly,NonFashion,Face,Logo,BrandLogo,Smile,Outdoor,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise,Designer,High street,Mega couture,Small couture
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
1171579752855683619_212070047,518.0,9840.0,541,abercrombie,High street,"beautiful, summer, fashion, love, cute, food, ...","We were born to be REAL, not to be perfect. ...",https://scontent.cdninstagram.com/t51.2885-15/...,97,0,1453883211,https://www.instagram.com/p/BBCSZPzAMIj/,0.000885,0.336438,0.74122,0.009454,0.001827,0.839466,0.890231,0.292843,0.062638,0.012164,0.931486,3.42211,0.083862,0.015089,0.000583,2.6e-05,1.7e-05,0.890586,0.009657,0.000181,0,1,0,0
1171594777274371222_176762322,7333.0,2300.0,272,abercrombie,High street,"teen, model, brunette, selfie, hollister, snap...","Gotta run, but first, let me take selfie. Me...",https://scontent.cdninstagram.com/t51.2885-15/...,94,0,1453885002,https://www.instagram.com/p/BBCVz4YJEyW/,0.003912,0.995503,0.001728,0.002125,0.002438,0.480598,0.550629,0.063139,0.126848,0.00433,0.466329,2.91971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,91,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,94,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1171508187966229230_2797323089,132.0,456.0,56,abercrombie,High street,"abercrombieandfitch, shopping, love, hollister...",#love #shopping #shoppen #hollister #abercro...,https://scontent.cdninstagram.com/t51.2885-15/...,9,1,1453874680,https://www.instagram.com/p/BBCCH1zMEru/,0.002021,0.014188,0.028351,0.418125,0.601519,0.01084,0.994191,0.332847,0.007911,0.002386,0.393063,1.45586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0


##### Now the brand category is represented in the last four columns of the dataframe with one_hot_encoding.

### In order to model with these we need to normalize the variables

In [13]:
for col in data_new.columns:
    if col not in ['BrandName', 'BrandCategory','Hashtags', 'Caption', 'ImgURL', 'Likes', 'Comments ', 'CreationTime','Link','Designer',
       'High street', 'Mega couture', 'Small couture']:
        data_new[col] = data_new[col]/np.linalg.norm(np.array(data_new[col], dtype = float))

# Model with Numeric Data Only

First, lets try to build a model with only the numeric data. This will be easy to do and will allow us to understand whether any of these numeric factors influence the number of likes/comments a user gets. 

### Likes Model

In [13]:
X = data_new[['Selfie', 'BodySnap', 'Marketing', 'ProductOnly', 'NonFashion',
       'Face', 'Logo', 'BrandLogo', 'Smile', 'Outdoor', 'NumberOfPeople',
       'NumberOfFashionProduct', 'Anger', 'Contempt', 'Disgust', 'Fear',
       'Happiness', 'Neutral', 'Sadness', 'Surprise', 'Designer',
       'High street', 'Mega couture', 'Small couture']]
y = data_new['Likes']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Let's use a Lasso Regression model to regress based on the likes that we receive. I want to use lasso because lasso will send the unimportant variables to zero allowing us to see which features are important. 

In [9]:
lasso = Lasso(alpha = 0.0001, max_iter = 10000).fit(X_train, y_train)

In [10]:
predictions = lasso.predict((X_test))
r2_score(y_test, predictions)

0.2984467039649339

Maybe ridge regression will work better?

In [11]:
ridge = Ridge(alpha = 0.0001).fit(X_train, y_train)

In [12]:
predictions = ridge.predict((X_test))
r2_score(y_test, predictions)

0.29844937687206075

So ridge regression is really no different...

### Try a Deep Learning Model

In [54]:
def our_model():
    model = Sequential()

    model.add(Dense(units=640, activation='tanh', input_dim = 24))
    model.add(Dense(units=640, activation='tanh'))
    model.add(Dense(units=640, activation='tanh'))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam',metrics=['accuracy'])
    return model

In [55]:
estimator = KerasRegressor(build_fn=our_model, epochs=10, batch_size=500, verbose=1)

In [56]:
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

Maybe for the neural network model to work we need to normalize the outputs as well??? I have only normalized the input space. Maybe we need to investigate how the input space was normalized. Maybe the categorical variables that are represented by 1's and 0's at the end of the dataframe need to be normalized?

Average loss value for Keras deep learning model based on just numeric values and predicting likes is 154K.

Maybe try using likes / followers?

# Likes to Follower Ratio

First check if there are any null likes or follower entries

In [81]:
data_new['Likes'].isnull().any() 

False

In [82]:
data_new['Followers'].isnull().any()

False

In [83]:
data_ver2 = pd.concat([data, brand_category], sort = False, axis = 1)
data_ver2['LikesToFollowers'] = data_ver2['Likes'] / data_ver2['Followers']
data_ver2.head()

Unnamed: 0_level_0,Followings,Followers,MediaCount,BrandName,BrandCategory,Hashtags,Caption,ImgURL,Likes,Comments,CreationTime,Link,Selfie,BodySnap,Marketing,ProductOnly,NonFashion,Face,Logo,BrandLogo,Smile,Outdoor,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise,Designer,High street,Mega couture,Small couture,LikesToFollowers
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
1171579752855683619_212070047,518.0,9840.0,541,abercrombie,High street,"beautiful, summer, fashion, love, cute, food, ...","We were born to be REAL, not to be perfect. ...",https://scontent.cdninstagram.com/t51.2885-15/...,97,0,1453883211,https://www.instagram.com/p/BBCSZPzAMIj/,0.000885,0.336438,0.74122,0.009454,0.001827,0.839466,0.890231,0.292843,0.062638,0.012164,0.931486,3.42211,0.083862,0.015089,0.000583,2.6e-05,1.7e-05,0.890586,0.009657,0.000181,0,1,0,0,0.009858
1171594777274371222_176762322,7333.0,2300.0,272,abercrombie,High street,"teen, model, brunette, selfie, hollister, snap...","Gotta run, but first, let me take selfie. Me...",https://scontent.cdninstagram.com/t51.2885-15/...,94,0,1453885002,https://www.instagram.com/p/BBCVz4YJEyW/,0.003912,0.995503,0.001728,0.002125,0.002438,0.480598,0.550629,0.063139,0.126848,0.00433,0.466329,2.91971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0.04087
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,91,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0.150413
1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,94,3,1453862683,https://www.instagram.com/p/BBBrPZ2h9lt/,0.015774,0.96388,0.025719,0.027023,0.003409,0.283399,0.095724,0.037635,0.107896,0.712017,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0.155372
1171508187966229230_2797323089,132.0,456.0,56,abercrombie,High street,"abercrombieandfitch, shopping, love, hollister...",#love #shopping #shoppen #hollister #abercro...,https://scontent.cdninstagram.com/t51.2885-15/...,9,1,1453874680,https://www.instagram.com/p/BBCCH1zMEru/,0.002021,0.014188,0.028351,0.418125,0.601519,0.01084,0.994191,0.332847,0.007911,0.002386,0.393063,1.45586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0.019737


In [115]:
data_ver2.replace([np.inf, -np.inf], np.nan, inplace = True);

In [116]:
data_test = data_ver2[data_ver2['LikesToFollowers'].isnull()]
data_test.head()

Unnamed: 0_level_0,Followings,Followers,MediaCount,BrandName,BrandCategory,Hashtags,Caption,ImgURL,Likes,Comments,CreationTime,Link,Selfie,BodySnap,Marketing,ProductOnly,NonFashion,Face,Logo,BrandLogo,Smile,Outdoor,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise,Designer,High street,Mega couture,Small couture,LikesToFollowers
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
1171240019544303946_23457617,0.0,0.0,1,acnestudios,Designer,"shop, anndemeulemeester, style, acnestudios, t...",Black? Or dark black... @00000 #totokaelo #l...,https://scontent.cdninstagram.com/t51.2885-15/...,37,0,1453842712,https://www.instagram.com/p/BBBFJefqOlK/,0.000158,0.001617,0.007038,0.997941,0.001827,0.000604,0.0996,0.073979,0.00038,0.001582,0.192986,3.91808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,
1171142669094830425_252313279,36.0,0.0,0,acnestudios,Designer,"acnestudios, collegeface",#collegeface in college\U0001f610 #acnestudios,https://scontent.cdninstagram.com/t51.2885-15/...,18,1,1453831107,https://www.instagram.com/p/BBAvA10s0FZ/,0.87076,0.20023,0.063385,0.003886,0.016565,0.991469,0.052578,0.024295,0.510374,0.032716,1.75241,2.34389,0.000330862,0.00389,4.89e-05,0.000497548,1.8e-05,0.829768,0.150697,0.014749,1,0,0,0,
1170503373243750859_2795691477,0.0,0.0,1,acnestudios,Designer,"sorpreseinmacchina, acnestudios, withstyle, ga...",#jackolsen #jauria #withstyle #migraineprobl...,https://scontent.cdninstagram.com/t51.2885-15/...,10,0,1453754897,https://www.instagram.com/p/BA-dp3KlxnL/,0.375648,0.528273,0.055469,0.001185,0.009081,0.996861,0.134891,0.024669,0.513444,0.315825,1.41385,4.22136,2.08e-07,0.000181,8.39e-07,4.16e-09,0.979905,0.019908,3e-06,1e-06,1,0,0,0,
1170007750938083500_2120361935,39.0,0.0,0,alexanderwang,Designer,"jimmychoo, slimthickwithyourcuteass, alexander...","u\""\""I swear she's so perfect she makes me s...",https://scontent.cdninstagram.com/t51.2885-15/...,74,0,1453695814,https://www.instagram.com/p/BA8s9m7HkCs/,0.002184,0.999626,0.000747,0.000168,0.000336,0.771275,0.02662,0.001609,0.331313,0.013736,0.667217,4.49539,0.000857398,0.001655,0.000445726,0.002339453,0.008483,0.939227,0.021567,0.025426,1,0,0,0,
1170482637253688665_365119398,2.0,0.0,0,alexanderwang,Designer,"makeup, fashion, motd, alexanderwang, ysl, fas...",Simply Slaying. #fashion #fashionable #fashi...,https://scontent.cdninstagram.com/t51.2885-15/...,65,1,1453752425,https://www.instagram.com/p/BA-Y8HRRL1Z/,0.000448,0.001546,0.303054,0.946735,0.007072,0.026929,0.993566,0.352947,0.004214,0.002586,0.379229,2.78594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,


Drop rows where LikesToFollowers is NAN. Likes and Followers count at both 0.

In [117]:
data_ver2 = data_ver2.dropna()

In [118]:
data_ver2['LikesToFollowers'].isnull().any()

False

In [119]:
data_ver2['LikesToFollowers']

UserId
1171579752855683619_212070047     0.009858
1171594777274371222_176762322     0.040870
1171407552643586413_581125501     0.150413
1171407552643586413_581125501     0.155372
1171508187966229230_2797323089    0.019737
                                    ...   
1171575731978099465_389885373     0.000000
1171566570370586806_253427645     0.000000
1171566525099211846_1433754542    0.000000
1171566525099211846_1433754542    0.000000
1171565778125281443_253427645     0.000000
Name: LikesToFollowers, Length: 24543, dtype: float64

In [120]:
X = data_ver2[['Selfie', 'BodySnap', 'Marketing', 'ProductOnly', 'NonFashion',
       'Face', 'Logo', 'BrandLogo', 'Smile', 'Outdoor', 'NumberOfPeople',
       'NumberOfFashionProduct', 'Anger', 'Contempt', 'Disgust', 'Fear',
       'Happiness', 'Neutral', 'Sadness', 'Surprise', 'Designer',
       'High street', 'Mega couture', 'Small couture']]
y = data_ver2['LikesToFollowers']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [122]:
def standardize(train, test):


    mean = np.mean(train, axis=0)
    std = np.std(train, axis=0)+0.000001

    X_train = (train - mean) / std
    X_test = (test - mean) /std
    return X_train, X_test

In [123]:
X_train, X_test = standardize(X_train, X_test)

In [124]:
X_train

Unnamed: 0_level_0,Selfie,BodySnap,Marketing,ProductOnly,NonFashion,Face,Logo,BrandLogo,Smile,Outdoor,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise,Designer,High street,Mega couture,Small couture
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1171507821618345730_1574502508,-0.259308,-0.824482,-0.528553,1.122620,0.706286,-0.819929,1.338838,-0.632729,-0.592633,-0.499348,-0.957247,-1.366377,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,-0.908278,1.832815,-0.284469,-0.567634
1171231788214163629_193097469,-0.257499,-0.821518,-0.395204,1.372306,-0.469912,-0.817212,0.905760,0.096165,-0.589268,-0.490598,-0.765123,0.273672,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,-0.908278,1.832815,-0.284469,-0.567634
1170757716739212431_33474720,-0.083411,1.660241,-0.434811,-0.930392,-0.429279,1.613308,-0.885282,-0.690709,3.174731,0.134557,2.035478,1.229400,0.048616,-0.028745,0.334422,-0.029306,-0.266855,2.634139,-0.145332,0.123645,-0.908278,1.832815,-0.284469,-0.567634
1160779262971561635_1558220976,-0.235183,0.118235,-0.324666,-0.438253,-0.401616,-0.640163,-1.157490,-0.588286,-0.513935,0.260650,0.004726,0.557519,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,-0.908278,-0.545606,-0.284469,1.761691
1171512225661878713_1643536695,0.009773,-0.367641,0.564813,-0.880679,0.103549,0.873789,-0.031832,-0.240964,0.174241,-0.148754,0.662870,-0.334646,0.461866,0.228294,1.153740,0.044043,2.030822,1.025846,0.038936,0.488148,1.100980,-0.545606,-0.284469,-0.567634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171371918978904970_1255231723,-0.128482,1.514154,-0.311073,-0.922707,-0.287621,1.267836,-0.102080,-0.339481,0.785832,0.174616,1.066442,0.270878,0.014871,-0.075847,-0.026543,0.274548,-0.251489,2.459536,0.150355,1.629310,-0.908278,1.832815,-0.284469,-0.567634
1101082779343622525_498541251,-0.245017,1.223258,-0.471399,-0.644079,-0.470364,-0.758604,-1.276440,-0.775869,-0.543664,-0.432439,-0.455082,0.531988,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,-0.908278,-0.545606,-0.284469,1.761691
1171555437158436633_205040098,-0.257739,-0.819451,-0.507412,1.364928,-0.465846,-0.818924,-0.843107,-0.214763,-0.591105,-0.451280,-0.836088,-0.276188,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,1.100980,-0.545606,-0.284469,-0.567634
1169853176213918638_1532376523,5.020211,-0.143800,-0.077522,-0.921410,-0.338819,1.562264,-0.969135,-0.541927,1.939863,-0.419760,2.027119,-0.731254,-0.102171,-0.123572,-0.088569,-0.057461,-0.275807,-0.429826,-0.160384,-0.105231,1.100980,-0.545606,-0.284469,-0.567634


In [126]:
import keras
keras.backend.floatx()
keras.backend.set_floatx('float64')
keras.backend.floatx()

'float64'

In [130]:
def our_model_ver2():
    model = Sequential()
    model.add(Dense(units=64, input_dim = 24))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.25))

    model.add(Dense(units=64))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.5))
    
    model.add(Dense(units=1))
    model.add(LeakyReLU(alpha=0.1))
    opt = Adadelta(clipnorm=0.3)
    model.compile(loss='mean_squared_error', optimizer=opt,metrics=['accuracy'])
    return model

estimator = KerasRegressor(build_fn=our_model_ver2, epochs=5, batch_size=50, verbose=1)

kfold = KFold(n_splits=5)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Baseline: -21.22 (11.47) MSE
