## Loading the packages

In [406]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Importing the dataset

In [407]:
df = pd.read_csv('train.csv')
df = df.dropna()
df.head()

Unnamed: 0,Choice,A_follower_count,A_following_count,A_listed_count,A_mentions_received,A_retweets_received,A_mentions_sent,A_retweets_sent,A_posts,A_network_feature_1,...,B_following_count,B_listed_count,B_mentions_received,B_retweets_received,B_mentions_sent,B_retweets_sent,B_posts,B_network_feature_1,B_network_feature_2,B_network_feature_3
0,0,228,302,3,0.583979,0.100503,0.100503,0.100503,0.36215,2,...,29808,1689,15.430498,3.984029,8.204331,0.332423,6.988815,66,75.530303,1916.893939
1,0,21591,1179,228,90.456506,25.798292,5.709329,1.111159,5.17662,369,...,848,1610,40.495021,8.943607,3.227677,0.564343,1.070321,163,132.030675,2931.515337
2,0,7310,1215,101,25.503644,9.556347,5.361519,0.591206,3.589718,95,...,482,206,0.734696,0.354379,0.603202,0.100503,0.603202,3,10.333333,277.333333
3,0,20,7,2,7.690824,0.277306,1.331508,0.100503,2.830627,6,...,17637,278,572.874856,390.293681,27.55204,7.167557,32.101906,1762,23.469296,1395.845634
4,1,45589,862,2641,148.854279,36.998884,27.881768,3.333492,23.861282,551,...,1711,181,21.601866,3.581661,6.764657,1.119727,4.563246,85,48.5,1993.627907


## Feature Engineering:

In this section, we are creating new features based on the features available in the dataset. Since the goal is to identify whether influencer A is more influent than influencer B, instead of having individual metrics for both influencers, we create comparison metrics. For instance, instead of having the **individual follower count** for each influencer, we have the **difference** in followers count between influencers A and B. The ratio could have also been calculated, but we chose to go with the difference, to avoid potential divisions by 0. 

We tried engineering more advanced features, such as the level of interaction, defined by the sum of retweets and mentions, divided by the total number of followers for a given influencer. However, as will be shown further down, those features did not have a significant importance in predicting influence. 

In [408]:
#follower count difference
df['Follower Count Difference'] = df['A_follower_count'] - df['B_follower_count'] 
#following count difference
df['Followig Count Difference'] = df['A_following_count'] - df['B_following_count'] 
#listed count difference
df['Listed Count Difference'] = df['A_listed_count'] - df['B_listed_count'] 
#received retweets difference
df['Received Retweets Difference'] = df['A_retweets_received'] - df['B_retweets_received'] 
#received mention difference
df['Received Mentions Difference'] = df['A_mentions_received'] - df['B_mentions_received']
#sent retweets difference
df['Sent Retweets Difference'] = df['A_retweets_sent'] - df['B_retweets_sent'] 
#sent mentions difference
df['Sent Mentions Difference'] = df['A_mentions_sent'] - df['B_mentions_sent'] 
#post difference
df['Post Difference'] = df['A_posts'] - df['B_posts']

###################################################################################################################
#Ratio of interactions (retweets received + mentions received)/posts -- NOT VERY USEFUL
#df['Interaction Ratio A'] = (df['A_retweets_received'] + df['A_mentions_received'])/df['A_follower_count']
#df['Interaction Ratio B'] = (df['B_retweets_received'] + df['B_mentions_received'])/df['B_follower_count']

#Listed/follower ratio --- NOT VERY USEFUL
#df['LF Ratio A'] = df['A_listed_count']/df['A_follower_count']
#df['LF Ratio B'] = df['B_listed_count']/df['B_follower_count']

#ratio of infulencer's posts that are retweeted ---- NOT VERY USEFUL
#df['Retweet % A'] = df['A_retweets_received']/df['A_posts']
#df['Retweet % B'] = df['B_retweets_received']/df['B_posts']

#Calculating following/followers ratio for each influencer ------ NOT VERY USEFUL
#df['following/followers_A'] = df['A_following_count']/df['A_follower_count']
#df['following/followers_B'] = df['B_following_count']/df['B_follower_count']

In [409]:
#Dropping the features that were precursors to the newly designed features:

df = df.drop(columns = ['A_follower_count',
                   'A_following_count',
                   'A_listed_count',
                   'A_mentions_received',
                   'A_retweets_received',
                   'A_mentions_sent',
                   'A_retweets_sent',
                   'A_posts', 
                   'B_follower_count',
                   'B_following_count',
                   'B_listed_count',
                   'B_mentions_received',
                   'B_retweets_received',
                   'B_mentions_sent',
                   'B_retweets_sent',
                   'B_posts'])
df.head()

Unnamed: 0,Choice,A_network_feature_1,A_network_feature_2,A_network_feature_3,B_network_feature_1,B_network_feature_2,B_network_feature_3,Follower Count Difference,Followig Count Difference,Listed Count Difference,Received Retweets Difference,Received Mentions Difference,Sent Retweets Difference,Sent Mentions Difference,Post Difference
0,0,2,166.5,11355.0,66,75.530303,1916.893939,-34235,-29506,-1686,-3.883525,-14.846518,-0.23192,-8.103828,-6.626665
1,0,369,18.442971,1330.366048,163,132.030675,2931.515337,-17671,331,-1382,16.854685,49.961485,0.546816,2.481652,4.106299
2,0,95,68.927835,5999.896907,3,10.333333,277.333333,3688,733,-105,9.201969,24.768949,0.490702,4.758317,2.986516
3,0,6,2.0,96.166667,1762,23.469296,1395.845634,-19542,-17630,-276,-390.016375,-565.184032,-7.067053,-26.220532,-29.271279
4,1,551,127.404293,2833.847943,85,48.5,1993.627907,38035,-849,2460,33.417223,127.252413,2.213765,21.117111,19.298035


## Feature Importance

In order to compute the relative important of each feature in our newly engineered dataset, an initial random forest model was run. That way, we have a quantitative way of assessing the importance of each predictor, which will reinforce/confirm domain knowledge when identifying the most important predictors of influence.

In [410]:
X = df.drop(columns = 'Choice')
y = df['Choice']

scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [411]:
rf = RandomForestClassifier(random_state = 0, n_estimators = 500)
model = rf.fit(X_std,y)

model.feature_importances_

importances = pd.DataFrame(list(zip(X.columns,model.feature_importances_)), columns = ['predictor','feature importance'])

importances.sort_values(by = 'feature importance', ascending = False)

Unnamed: 0,predictor,feature importance
8,Listed Count Difference,0.148427
6,Follower Count Difference,0.121338
10,Received Mentions Difference,0.110956
9,Received Retweets Difference,0.099928
3,B_network_feature_1,0.070874
0,A_network_feature_1,0.069579
13,Post Difference,0.055813
12,Sent Mentions Difference,0.054854
7,Followig Count Difference,0.053043
2,A_network_feature_3,0.044415


## Training and testing the model

In [412]:
from sklearn.model_selection import train_test_split

X = df[['Listed Count Difference',
        'Follower Count Difference',
        'Received Mentions Difference',
        'Received Retweets Difference',
        'B_network_feature_1',
        'A_network_feature_1',
        'Post Difference']]

y = df['Choice']

In [413]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.05, random_state=5)

X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

model = rf.fit(X_train_std,y_train)
y_pred = model.predict(X_test_std)

print(round(accuracy_score(y_pred,y_test),3)*100,'% accuracy')

83.3 % accuracy


## Getting the wrong observations

In [400]:
wrong_observations = []

for i in range(len(y_pred)):
    if (y_test.iloc[i] != y_pred[i]):
        wrong_observations.append(y_test.index[i])
wrong_observations

[3552,
 1434,
 4005,
 2060,
 3941,
 2614,
 767,
 3372,
 674,
 2983,
 5031,
 2693,
 5437,
 769,
 3994,
 5415,
 3409,
 4393,
 1752,
 2191,
 2165,
 892,
 1977,
 608,
 5426,
 51,
 4134,
 1722,
 3817,
 4488,
 4237,
 2928,
 4150,
 4619,
 729,
 3836,
 3178,
 2509,
 944,
 3680,
 2892,
 1110,
 1781,
 40,
 532,
 111]

In [401]:
len(y_test)

275

In [402]:
len(wrong_observations)

46