# 00: Getting Familiar with DataSets

In [1]:
! ls -CF data

01-olympics.json		  04-derby.xlsx
02-freeland-README.md		  05-un-speech-README.md
02-freeland.xlsx		  05-un-speech.xlsx
03-question-parliament-README.md  06-blue-girl.xlsx
03-question-parliament.xlsx	  07-bimar/
04-derby-README.md		  README.md


In [2]:
!ls -CF data/07-bimar/

00.json  01.json  02.json  03.json  04.json


In [3]:
import pandas as pd
import numpy as np

## Loading Data

In [4]:
olympics = pd.read_json("data/01-olympics.json", lines=True)
free_land = pd.read_excel("data/02-freeland.xlsx", engine='openpyxl')
question_parliament = pd.read_excel("data/03-question-parliament.xlsx", engine='openpyxl')
derby = pd.read_excel("data/04-derby.xlsx", engine='openpyxl')
un_speech = pd.read_excel("data/05-un-speech.xlsx", engine='openpyxl')
blue_girl = pd.read_excel("data/06-blue-girl.xlsx", engine='openpyxl')

In [5]:
bimar = pd.concat([
    pd.read_json("data/07-bimar/01.json", lines=True),
    pd.read_json("data/07-bimar/02.json", lines=True),
    pd.read_json("data/07-bimar/03.json", lines=True),
    pd.read_json("data/07-bimar/04.json", lines=True),
])

## Cleaning Data

In [6]:
free_land = free_land.dropna(how='all')
question_parliament = question_parliament.dropna(how='all')
derby = derby.dropna(how='all')
un_speech = un_speech.dropna(how='all')
blue_girl = blue_girl.dropna(how='all')


bimar = bimar.drop_duplicates(subset=['id'])  # Remove duplicate twitts

## Describe Data

In [7]:
_ = pd.DataFrame(np.array([
    ['Freeland', free_land.size, free_land.columns.to_list()],
    ['Question Parliament', question_parliament.size, question_parliament.columns.to_list()],
    ['Derby', derby.size, derby.columns.to_list()],
    ['UN Speech', un_speech.size, un_speech.columns.to_list()],
    ['Blue Girl', blue_girl.size, blue_girl.columns.to_list()],
    ['Bimar', bimar.size, bimar.columns.to_list()],
    ['Olympics', olympics.size, olympics.columns.to_list()],
]), columns=["DataSet", "Rows", "Columns"], dtype=object)
_.Rows = _.Rows.apply(lambda x : "{:,}".format(x))

display(_)

  _ = pd.DataFrame(np.array([


Unnamed: 0,DataSet,Rows,Columns
0,Freeland,7562,"[Date, Screen Name, Full Name, Tweet Text, Twe..."
1,Question Parliament,113962,"[Date, Screen Name, Full Name, Tweet Text, Twe..."
2,Derby,56981,"[Date, Screen Name, Full Name, Tweet Text, Twe..."
3,UN Speech,28462,"[Date, Screen Name, Full Name, Tweet Text, Twe..."
4,Blue Girl,4354889,"[extracted_hashtag, follower_count, following_..."
5,Bimar,216183,"[created_at, id, id_str, text, source, truncat..."
6,Olympics,472006,"[created_at, id, id_str, text, source, truncat..."


#### Difference between Olympics and Bimar columns: 

In [8]:
display(bimar.columns)
list(set(bimar.columns.to_list()) - set(olympics.columns.to_list()))

Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'retweet_count', 'favorite_count',
       'entities', 'favorited', 'retweeted', 'filter_level', 'lang',
       'timestamp_ms', 'retweeted_status', 'display_text_range',
       'extended_tweet', 'possibly_sensitive', 'extended_entities',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status'],
      dtype='object')

['display_text_range', 'extended_tweet']

#### Freeland, Question Parliament, Derby and UN Speech have same columns

In [9]:
display(free_land.columns)
_["Columns"][0] == _["Columns"][1] == _["Columns"][2] == _["Columns"][3]

Index(['Date', 'Screen Name', 'Full Name', 'Tweet Text', 'Tweet ID', 'Link(s)',
       'Media', 'Location', 'App', 'Followers', 'Follows', 'Listed', 'Verfied',
       'User Since', 'Location.1', 'Bio', 'Website', 'Timezone',
       'Profile Image'],
      dtype='object')

True

#### Blue Girl Columns

In [10]:
blue_girl.columns

Index(['extracted_hashtag', 'follower_count', 'following_count', 'is_quote',
       'lang', 'like_count', 'norm_description', 'post_count', 'time-date',
       'time-time', 'type'],
      dtype='object')

## Generating Collections
### 1. col1: Freeland, Question Parliament, Derby, UN Speech

In [11]:
col1 = pd.concat([free_land, question_parliament, derby, un_speech])
col1 = col1.drop_duplicates(subset=['Tweet ID'])  # Remove 200 duplicate tweets

##### Add labels based on the data origin

In [25]:
free_lancers = free_land['Screen Name'].unique()
soccer_lover = derby['Screen Name'].unique()
political_enthusiast = pd.concat([question_parliament, un_speech])['Screen Name'].unique()

In [26]:
col1.loc[col1['Screen Name'].isin(free_lancers), 'free lancer']=True
col1.loc[col1['Screen Name'].isin(soccer_lover), 'soccer lover']=True
col1.loc[col1['Screen Name'].isin(political_enthusiast), 'political enthusiast']=True

## Save Data

In [28]:
prefix = "00"

olympics.to_pickle(f'pickles/{prefix}-olympics.pkl')
blue_girl.to_pickle(f'pickles/{prefix}-bluegirl.pkl')
free_land.to_pickle(f'pickles/{prefix}-free_land.pkl')
question_parliament.to_pickle(f'pickles/{prefix}-question_parliament.pkl')
derby.to_pickle(f'pickles/{prefix}-derby.pkl')
un_speech.to_pickle(f'pickles/{prefix}-un_speech.pkl')
bimar.to_pickle(f'pickles/{prefix}-bimar.pkl')

col1.to_pickle(f'pickles/{prefix}-col1.pkl')

!ls pickles/00-*

pickles/00-bimar.pkl	 pickles/00-free_land.pkl
pickles/00-bluegirl.pkl  pickles/00-olympics.pkl
pickles/00-col1.pkl	 pickles/00-question_parliament.pkl
pickles/00-derby.pkl	 pickles/00-un_speech.pkl
