# Processing Twitter data

### A look at a single file

First, we load a single file of 4,000 tweets into a dataframe to check for nested dictionaries.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read one sample file so the nested structure of a tweet record can be inspected.
df = pd.read_json(path_or_buf='tweets0.json')

First we define the features that go straight into the dataframe without any unpacking required.

Then we sequentially unpack the nested dictionaries by applying **pd.Series**.

In [23]:
# Flatten the nested tweet records in `df` into a tidy frame, `df_flat`.
#
# Every nested column is expanded with `.apply(pd.Series)` exactly once and the
# result is reused: the call builds one Series per row, so repeating it for each
# extracted field multiplies the most expensive step of this cell.
# Columns are selected with brackets rather than attribute access (`.name`,
# `.user`, ...), which would resolve to a DataFrame attribute of the same name
# if one existed.
user_info = df['user'].apply(pd.Series)
entities_info = df['entities'].apply(pd.Series)
retweet_info = df['retweeted_status'].apply(pd.Series)
retweet_user_info = retweet_info['user'].apply(pd.Series)
quoted_info = df['quoted_status'].apply(pd.Series)
quoted_user_info = quoted_info['user'].apply(pd.Series)

df_flat = pd.DataFrame()
# id and date
df_flat['id'] = df['id']
df_flat['date'] = df['created_at']
# user info: `user` is the profile name, `user_name` the screen name (handle)
df_flat['user'] = user_info['name']
df_flat['user_name'] = user_info['screen_name']
df_flat['place'] = user_info['location']
# tweet info
df_flat['text'] = df['text']
df_flat['hashtags'] = entities_info['hashtags']
df_flat['mentions'] = entities_info['user_mentions']
df_flat['favorite_count'] = df['favorite_count']
df_flat['retweet_count'] = df['retweet_count']
# retweet user info (missing where the row has no `retweeted_status`)
df_flat['retweet_user'] = retweet_user_info['name']
df_flat['retweet_user_name'] = retweet_user_info['screen_name']
# quoted user info (missing where the row has no `quoted_status`)
df_flat['quoted_user'] = quoted_user_info['name']
df_flat['quoted_user_name'] = quoted_user_info['screen_name']
# reply user info
df_flat['reply_user_name'] = df['in_reply_to_screen_name']
df_flat.head()

  union = _union_indexes(indexes)
  result = result.union(other)
  result = result.union(other)
  union = _union_indexes(indexes)


Unnamed: 0,id,date,user,user_name,place,text,hashtags,mentions,favorite_count,retweet_count,retweet_user,retweet_user_name,quoted_user,quoted_user_name,reply_user_name
0,1070441726400233472,2018-12-05 22:16:10,jaybay,yungjaybay,"Knoxville, TN",RT @IWriteAllDay_: I read that Ariana Grande/ ...,[],"[{'screen_name': 'IWriteAllDay_', 'name': 'Cla...",0,1170,Clarkisha Kent,IWriteAllDay_,,,
1,1070441605730054144,2018-12-05 22:15:41,Jessi Styles💕,JessicakeliCP,"Rio de Janeiro, Brasil",RT @EsquadraoAriana: DOMINANDO TUDO!!\nKim Kar...,[],"[{'screen_name': 'EsquadraoAriana', 'name': 'E...",0,62,Esquadrão Ariana 💋,EsquadraoAriana,,,
2,1070441483554185217,2018-12-05 22:15:12,Rusty Goat,RustyGoat,Near Lubbock Texas,Ariana Grande showed off her new ink in a behi...,[],[],0,0,,,,,
3,1070441286832922625,2018-12-05 22:14:25,Gabi #NBLAH,mcyrusdope,"Belém, Brasil",RT @TheMusicManiacs: BEST New Music Video? 😍\n...,[],"[{'screen_name': 'TheMusicManiacs', 'name': 'M...",0,33,Music Mania 🎶,TheMusicManiacs,,,
4,1070441244965376000,2018-12-05 22:14:15,maria ortegon,mariaortegon12,"Lytle, TX",RT @people: Ariana Grande Sent Pete Davidson '...,[],"[{'screen_name': 'people', 'name': 'People', '...",0,4,People,people,,,


In [4]:
# Save the flattened single-file sample to disk as CSV.
df_flat.to_csv(path_or_buf='flat_tweets.csv')

### Looping through all files

In [3]:
import os, json

# Collect the names of the JSON files in the current working directory.
# `os.listdir()` returns entries in arbitrary order, so the names are sorted to
# make the order in which the files are later loaded identical on every run.
json_files = sorted(name for name in os.listdir() if name.endswith('.json'))
print(json_files[:3])

['tweets0.json', 'tweets1.json', 'tweets10.json']


In [4]:
# Load every tweet file and combine them into a single frame.
#
# The per-file frames are collected first and concatenated once. Growing `df`
# with `DataFrame.append` inside the loop re-copies all accumulated rows on each
# iteration, and `DataFrame.append` was removed in pandas 2.0.
# Row labels are kept exactly as read from each file (no `ignore_index`), which
# matches what the incremental append produced; labels may therefore repeat
# across files.
frames = [pd.read_json(file) for file in json_files]
# `pd.concat` raises on an empty list; keep the original "no files -> empty frame" result.
df = pd.concat(frames) if frames else pd.DataFrame()

In [5]:
# (rows, columns) of `df`, the frame that accumulates every file in `json_files`.
df.shape

(265810, 31)

In [6]:
# Flatten the nested tweet records of the combined frame `df` into `df_flat`.
#
# Each nested column is expanded with `.apply(pd.Series)` only once and then
# reused: the expansion creates one Series per row, so running it again for
# every extracted field multiplies the slowest step of this cell.
# Bracket selection is used instead of attribute access (`.name`, `.user`, ...)
# so a column can never be confused with a DataFrame attribute of the same name.
user_info = df['user'].apply(pd.Series)
entities_info = df['entities'].apply(pd.Series)
retweet_info = df['retweeted_status'].apply(pd.Series)
retweet_user_info = retweet_info['user'].apply(pd.Series)
quoted_info = df['quoted_status'].apply(pd.Series)
quoted_user_info = quoted_info['user'].apply(pd.Series)

df_flat = pd.DataFrame()
# id and date
df_flat['id'] = df['id']
df_flat['date'] = df['created_at']
# user info: `user` is the profile name, `user_name` the screen name (handle)
df_flat['user'] = user_info['name']
df_flat['user_name'] = user_info['screen_name']
df_flat['place'] = user_info['location']
# tweet info
df_flat['text'] = df['text']
df_flat['hashtags'] = entities_info['hashtags']
df_flat['mentions'] = entities_info['user_mentions']
df_flat['favorite_count'] = df['favorite_count']
df_flat['retweet_count'] = df['retweet_count']
df_flat['language'] = df['lang']
# retweet user info (missing where the row has no `retweeted_status`)
df_flat['retweet_user'] = retweet_user_info['name']
df_flat['retweet_user_name'] = retweet_user_info['screen_name']
# quoted user info (missing where the row has no `quoted_status`)
df_flat['quoted_user'] = quoted_user_info['name']
df_flat['quoted_user_name'] = quoted_user_info['screen_name']
# reply user info
df_flat['reply_user_name'] = df['in_reply_to_screen_name']
df_flat.head()

  union = _union_indexes(indexes)
  result = result.union(other)
  union = _union_indexes(indexes)
  result = result.union(other)


Unnamed: 0,id,date,user,user_name,place,text,hashtags,mentions,favorite_count,retweet_count,language,retweet_user,retweet_user_name,quoted_user,quoted_user_name,reply_user_name
0,1070778907794792450,2018-12-06 20:36:00,Chris,happycamper712,"Oklahoma, USA",RT @amyklobuchar: The people spoke in this ele...,[],"[{'screen_name': 'amyklobuchar', 'name': 'Amy ...",0,1645,en,Amy Klobuchar,amyklobuchar,,,
1,1070778904640716800,2018-12-06 20:35:59,Apolline Déchamp,ApollineDchamp,,RT @CanadaFP: Canada announces support for #el...,"[{'text': 'elections', 'indices': [43, 53]}, {...","[{'screen_name': 'CanadaFP', 'name': 'Foreign ...",0,13,en,Foreign Policy CAN,CanadaFP,,,
2,1070778903877312514,2018-12-06 20:35:59,Darren Douglas,DazDouglas,"Tyne and Wear, UK",@theresa_may Pls pls pls be the prime minister...,[],"[{'screen_name': 'theresa_may', 'name': 'There...",0,0,en,,,,,theresa_may
3,1070778902929268736,2018-12-06 20:35:59,Damion Walls 🌊🌊,DanDautoplus,,RT @JuddLegum: Leslie McCrae Dowless ran an ou...,[],"[{'screen_name': 'JuddLegum', 'name': 'Judd Le...",0,303,en,Judd Legum,JuddLegum,,,
4,1070778901734088705,2018-12-06 20:35:58,Guilherme Tinoco,gtinocolh,BH e RJ,RT @_thalesnogueira: Ótimo artigo do Acemoglu ...,[],"[{'screen_name': '_thalesnogueira', 'name': 'T...",0,2,pt,Thales Nogueira,_thalesnogueira,,,


In [7]:
# Show the column labels of the flattened frame.
df_flat.columns

Index(['id', 'date', 'user', 'user_name', 'place', 'text', 'hashtags',
       'mentions', 'favorite_count', 'retweet_count', 'language',
       'retweet_user', 'retweet_user_name', 'quoted_user', 'quoted_user_name',
       'reply_user_name'],
      dtype='object')

In [8]:
# Save the flattened frame to disk as CSV.
df_flat.to_csv(path_or_buf='all_tweets.csv')

# Exploring the dataset

In [9]:
# Number of distinct values in the `user` column (the profile name taken from
# the nested user record).
len(df_flat['user'].unique())

129361

In [10]:
# Per-group summary used with `.agg`: total retweets and number of tweets (rows).
aggregation = dict(retweet_count='sum', id='count')

A look at the users with the most retweets

In [11]:
# Keep only rows without a retweeted user (i.e. rows that are not retweets),
# total the retweet counts per screen name, and show the five largest totals.
(
    df_flat.loc[df_flat['retweet_user'].isna()]
    .groupby('user_name')
    .agg(aggregation)
    .sort_values('retweet_count', ascending=False)
    .head()
)

Unnamed: 0_level_0,retweet_count,id
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AriBerman,14503,6
paulkrugman,8884,2
AdamSchiff,6872,1
nhannahjones,5134,3
davidaxelrod,4813,1


A look at the users that retweeted the most

In [12]:
# Earliest and latest timestamp in the flattened frame.
# The Series' own min()/max() are used instead of the Python builtins: they are
# vectorised rather than iterating element by element, and they skip missing
# values by default.
print(df_flat['date'].min())
print(df_flat['date'].max())

2018-12-05 06:36:02
2018-12-06 20:36:00


In [13]:
# Keep only rows that carry a retweeted user (i.e. retweets), count them per
# screen name, and show the five accounts with the most retweets posted.
(
    df_flat.loc[df_flat['retweet_user'].notna()]
    .groupby('user_name')
    .agg({'id': 'count'})
    .sort_values('id', ascending=False)
    .head()
)

Unnamed: 0_level_0,id
user_name,Unnamed: 1_level_1
TOKYO_DEMOCRACY,320
Vox_Democracy,226
Sajawal70014704,225
const_democracy,214
aqeelgazianiri1,183
