In [22]:
# libraries

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from pandas.plotting import scatter_matrix

import seaborn as sns

from datetime import datetime

%matplotlib inline

In [23]:
# Load the per-account summary table and preview its first rows.
df = pd.read_csv(
    "summarize_accounts.csv",
    encoding='utf-8',
)

df.head()

Unnamed: 0_level_0,label,count_tweets,count_words,count_truncated,count_in_reply_to_status_id,count_in_reply_to_user_id,count_in_reply_to_screen_name,count_retweeted_status_id,count_place,sum_retweet_count,sum_reply_count,sum_favorite_count,count_favorited,count_retweeted,possibly_sensitive,sum_num_hastags,sum_num_urls,sum_num_mentions,max_created_at,min_created
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
467203615,social_spambots_1,1280,19395,0,0,0,0,4,0,991,0,307,0,0,1240,100,33,4,2014-11-12T22:31:06,2014-05-19T19:55:39
545309765,social_spambots_1,1280,19305,0,0,0,0,9,0,1709,0,453,0,0,1246,122,25,14,2014-11-13T08:58:07,2014-05-27T10:12:33
21706899,fake_followers,1,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2009-02-23T23:26:02,2009-02-23T23:26:02
34461690,fake_followers,1,5,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2009-04-23T00:33:09,2009-04-23T00:33:09
38021637,fake_followers,1,16,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2009-05-05T21:19:25,2009-05-05T21:19:25


In [24]:
# Distinct account classes present in the `label` column.
pd.unique(df['label'])

array(['social_spambots_1', 'fake_followers', 'genuine',
       'social_spambots_3', 'social_spambots_2'], dtype=object)

In [25]:
# Column names of the loaded table, as a NumPy array.
np.asarray(df.columns)

array(['label', 'count_tweets', 'count_words', 'count_truncated',
       'count_in_reply_to_status_id', 'count_in_reply_to_user_id',
       'count_in_reply_to_screen_name', 'count_retweeted_status_id',
       'count_place', 'sum_retweet_count', 'sum_reply_count',
       'sum_favorite_count', 'count_favorited', 'count_retweeted',
       'possibly_sensitive', 'sum_num_hastags', 'sum_num_urls',
       'sum_num_mentions', 'max_created_at', 'min_created'], dtype=object)

In [26]:
# Average number of words per tweet for each account.
df['words_per_tweet'] = df['count_words'].div(df['count_tweets'])

In [27]:
# Columns the author found to carry no information; remove them from the table.
columns = [
    'count_truncated',
    'count_favorited',
]

df = df.drop(columns=columns)

In [28]:
# Average retweet count per tweet for each account.
df['retweets_per_tweet'] = df['sum_retweet_count'].div(df['count_tweets'])

In [29]:
# Average reply count per tweet for each account.
df['reply_per_tweet'] = df['sum_reply_count'].div(df['count_tweets'])

In [30]:
# Average favorite count per tweet for each account.
df['favcounts_per_tweet'] = df['sum_favorite_count'].div(df['count_tweets'])

In [31]:
# Ratio of the `possibly_sensitive` count to the tweet count, expressed as a percentage.
df['percent_sensitive_tweets'] = df['possibly_sensitive'].div(df['count_tweets']).mul(100)

In [32]:
# Average number of hashtags per tweet (the source column is spelled `sum_num_hastags`).
df['hashtags_per_tweet'] = df['sum_num_hastags'].div(df['count_tweets'])

In [33]:
# Average number of URLs per tweet for each account.
df['urls_per_tweet'] = df['sum_num_urls'].div(df['count_tweets'])

In [34]:
# Average number of mentions per tweet for each account.
df['mentions_per_tweet'] = df['sum_num_mentions'].div(df['count_tweets'])

In [35]:
# TODO: open question from the author -- should these timestamp columns be dropped?
# For now the first/last tweet timestamps are removed from the table.
columns = [
    'min_created',
    'max_created_at',
]

df = df.drop(columns=columns)

In [36]:
# Split the accounts by class and report how many rows each class has.

account_label = df['label']

spambot1 = df.loc[account_label == 'social_spambots_1']
spambot2 = df.loc[account_label == 'social_spambots_2']
spambot3 = df.loc[account_label == 'social_spambots_3']
fakefollowers = df.loc[account_label == 'fake_followers']
realusers = df.loc[account_label == 'genuine']

# Overall row count first, then the (rows, columns) shape of each class subset.
print('total:', len(account_label))
for class_subset in (spambot1, spambot2, spambot3, fakefollowers, realusers):
    print(class_subset.shape)

total: 9197
(991, 24)
(3457, 24)
(464, 24)
(3202, 24)
(1083, 24)


In [37]:
# Remove the `count_retweeted` column from the table.
df = df.drop(columns=['count_retweeted'])

## Explanation of some columns:

https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.html

in_reply_to_status_id = "If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s ID."

in_reply_to_user_id	= "If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s author ID. This will not necessarily always be the user directly mentioned in the Tweet."

in_reply_to_screen_name	= "If the represented Tweet is a reply, this field will contain the screen name of the original Tweet’s author."

In [38]:
# Split the account table into a training set (75%) and a test set (25%).
#
# The split is stratified on `label`: the account classes are unbalanced, so an
# unstratified random split can leave the smaller classes with a different share in
# the training and test sets. Stratifying keeps the class proportions the same in
# both. `random_state` is fixed so the split is reproducible.

traindf, testdf = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    stratify=df['label'],
)

# Alternative names bound to the same DataFrame objects (no copy is made).
train_data = traindf
test_data = testdf

In [39]:
# Report the (rows, columns) shape of each split.
for caption, split_frame in (
    ('training data size:', train_data),
    ('test data size:', test_data),
):
    print(caption, split_frame.shape)

training data size: (6897, 23)
test data size: (2300, 23)


In [None]:
# Scatter matrix of the engineered per-tweet features in the training data.
#
# The grid is limited to the rate features rather than every remaining column:
# seaborn draws one axes per pair of variables, so plotting all 22 non-label columns
# would produce a 22 x 22 grid that is very slow to render and hard to read.
# Points are coloured by `label` so differences between account classes are visible.

pairplot_features = [
    'words_per_tweet',
    'retweets_per_tweet',
    'reply_per_tweet',
    'favcounts_per_tweet',
    'percent_sensitive_tweets',
    'hashtags_per_tweet',
    'urls_per_tweet',
    'mentions_per_tweet',
]

sns.set(style="ticks")
# Histograms on the diagonal avoid fitting a kernel density estimate to a feature
# that may be constant within a class. The trailing `;` hides the PairGrid repr.
sns.pairplot(traindf, vars=pairplot_features, hue='label', diag_kind='hist');