In [29]:
import requests
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl
import TwitterUtils as TU
import seaborn as sns
import re

import spacy
import spacy_langdetect as sld

## Combining Data

The `GetPlaces.py`, `GetTweets.py`, and `SampleUser.py` scripts generated the following output files:
* `users.json`: all user-specific data for the 51,000 sampled users
* `places.pkl`: metadata for all Twitter places in the user sample
* `tweets.pkl`: 100 tweets retrieved from each user

In [30]:
# Regex sources are written as raw strings: in a normal string literal "\b" is a
# backspace character rather than a word boundary, which stops the URL pattern
# from ever matching.
URL_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)'

# Matches a run of consecutive @mentions (handles of 1-15 word characters), each
# preceded by the start of the string or by a character that is neither "@" nor
# a word character. An earlier pattern assigned to this name was overwritten
# before use and has been removed.
twitter_username_re = r"((^|[^@\w])@(\w{1,15})\b)*"

In [31]:
# SECURITY: unpickling executes code embedded in the file, so only load pickles
# that were produced by this project's own scripts.
with open("places.pkl", "rb") as places_file:  # Unique Places
    places = pkl.load(places_file)

with open('tweets.pkl', 'rb') as tweets_file:  # Rename to Tweets, user_id, place_id
    data = pkl.load(tweets_file)

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding (assumes the file was written as UTF-8/ASCII JSON - TODO confirm).
with open('users.json', 'r', encoding='utf-8') as users_file:  # Actual data for user accounts
    user_json = users_file.read()

def _iter_json_documents(text):
    """Yield each top-level JSON value stored back to back in ``text``.

    Parameters
    ----------
    text : str
        Zero or more JSON documents concatenated together, optionally
        separated by whitespace.

    Yields
    ------
    object
        The decoded Python value of each document, in file order.

    Raises
    ------
    json.JSONDecodeError
        If the text at the current position is not valid JSON.
    """
    decoder = json.JSONDecoder()
    position, end = 0, len(text)
    while position < end:
        # raw_decode() does not accept leading whitespace, so step over separators.
        if text[position].isspace():
            position += 1
            continue
        document, position = decoder.raw_decode(text, position)
        yield document


# users.json holds several JSON objects written one after another. Decoding them
# one at a time avoids rewriting "}{" to "},{" across the whole file, which also
# altered any "}{" that appeared inside a string value.
user_data = {"total": list(_iter_json_documents(user_json))}
users = [u['data'] for u in user_data["total"]]
# Each 'data' entry is itself a list of user records; flatten to one record per row.
flat_list = [user for batch in users for user in batch]

# 'name' is renamed so it cannot clash with the place 'name' column after merging.
users_df = pd.DataFrame(flat_list).rename(columns={'name': 'user_name_field'})

In [32]:
# Only the place objects are needed from here on, not the mapping's keys.
places_unpacked = list(places.values())
def unpack_place(place):
    """Flatten a place object into a plain tuple.

    Reads the attributes ``id``, ``name``, ``full_name``, ``country``,
    ``country_code`` and ``place_type`` from ``place`` and returns their
    values as a tuple in that order.
    """
    attribute_names = ("id", "name", "full_name", "country", "country_code", "place_type")
    return tuple(getattr(place, attribute) for attribute in attribute_names)

# One tuple per place, then a frame with one row per place.
unpacked_places = list(map(unpack_place, places_unpacked))
place_df = pd.DataFrame(
    data=unpacked_places,
    columns=("id", "name", "full_name", "country", "country_code", "type"),
)

# Raw string: in a normal string literal "\b" is a backspace character, not a
# regex word boundary, and the pattern would never match a URL.
URL_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

In [5]:
# URL pattern, assembled from adjacent raw-string fragments (joined at compile time).
URL_regex = (
    r'https?:\/\/'                          # scheme
    r'(www\.)?'                             # optional www. prefix
    r'[-a-zA-Z0-9@:%._\+~#=]{1,256}'        # host name
    r'\.[a-zA-Z0-9()]{1,6}\b'               # dot plus a 1-6 character suffix
    r'([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)'    # path, query string and fragment
)

In [35]:
df = pd.DataFrame(data)
# Left joins keep every tweet row, even when no place or user record matches.
merged = df.merge(place_df, how='left', left_on='place_id', right_on='id')
full_data = merged.merge(users_df, how='left', left_on='user_id', right_on='id')
# Notes kept from the original cell:
# full_data.to_csv('fulldata.csv', index = False) # 3 Gigs of data, not great...
# full_data.description.iloc[0,]
# places_unpacked # Cool opportunity for geographic visualization

In [36]:
# Number of distinct user ids in the merged data (a missing value counts as one).
full_data['user_id'].nunique(dropna=False)

30470

In [37]:
# Rank countries by number of rows and keep the six largest.
by_country = merged.groupby('country').count()
top6 = by_country.sort_values('user_id', ascending=False).iloc[:6]
total = by_country['user_id'].sum()
# Share of all country-tagged rows that fall in those six countries.
top6['user_id'].div(total).sum()

0.9066735266648764

In [38]:
# Show the column names after both merges; both right-hand frames were joined
# on a column called 'id', which is why id_x and id_y appear in the result.
full_data.columns

Index(['user_id', 'tweet_id', 'tweet_text', 'place_id', 'id_x', 'name',
       'full_name', 'country', 'country_code', 'type', 'username',
       'description', 'id_y', 'user_name_field', 'location', 'withheld'],
      dtype='object')

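The top 6 countries account for about 91% of the collected tweets that have a country attached (the share is computed from per-country row counts of `merged`, so it counts tweets rather than distinct users), which suggests pretty good coverage. Next, drop the unnecessary columns.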

Rename Columns

In [39]:
# Columns to keep from the merged frame; id_x, id_y and withheld are left out.
target_fields = [
    'user_id', 'tweet_id', 'tweet_text', 'place_id',
    'name', 'full_name', 'country', 'country_code', 'type',
    'username', 'description', 'user_name_field', 'location',
]
# Prefix place and profile fields so their origin stays clear in the combined frame.
reduced_df = full_data.loc[:, target_fields].rename(
    columns={
        'name': 'place_name',
        'full_name': 'full_place_name',
        'type': 'place_type',
        'description': 'profile_description',
        'user_name_field': 'profile_name',
        'location': 'profile_location',
    }
)

Drop Duplicates

In [40]:
# Take the country labels straight from the per-country summary rather than from
# `top6`: this cell rebinds `top6` to the filtered tweets below, so reading
# `top6.index` on a second run would return tweet row labels and filter
# everything out.
top6_countries = by_country.sort_values(by='user_id', ascending=False).head(6).index
top6 = reduced_df[reduced_df['country'].isin(top6_countries)]
unique_tweets = top6['tweet_id'].unique()
# Keep the first row for each tweet id.
top6 = top6.drop_duplicates('tweet_id')

Detect languages

In [41]:
def get_lang_detector(nlp, name):
    """Create a new ``sld.LanguageDetector`` component.

    ``nlp`` and ``name`` are accepted but not used by this function.
    """
    detector = sld.LanguageDetector()
    return detector

# Load the en_core_web_sm pipeline, register get_lang_detector as the factory
# named 'language_detector', then append that component as the last pipeline step.
# NOTE(review): the previous comment here read "Uncomment when running for first
# time", but these lines are active - confirm whether they were meant to be toggled.
nlp = spacy.load("en_core_web_sm")
spacy.Language.factory('language_detector', func = get_lang_detector)
nlp.add_pipe('language_detector', last =True)

def get_language(text):
    """Run ``text`` through ``nlp`` and return the detector's ``'score'`` entry.

    Despite the function name, the value returned is the ``'score'`` field of
    ``doc._.language``, not the language itself.
    NOTE(review): confirm whether the language code was the intended result.
    """
    doc = nlp(text)
    detection = doc._.language
    return detection['score']

In [48]:
# Try the detector on the first tweet.
get_language(top6['tweet_text'].iloc[0])

0.9999960942533268

In [49]:
# One dict per row, then one detector result per tweet, in row order.
data_dict = top6.to_dict(orient='records')
langs = []
for record in data_dict:
    langs.append(get_language(record['tweet_text']))

## Clean Tweet Texts

In [63]:
# Second tweet, used as a sample for trying out the cleaning regexes.
text_test = top6['tweet_text'].iloc[1]
text_test

'@Gajendr70729189 @amitsharma2704 @1shankarsharma Including my SAP technology business.  Thank you. Namaste.'

In [66]:
# Strip URLs first and feed that result into the @mention removal; previously the
# URL substitution's return value was discarded, so only mentions were removed.
re.sub(twitter_username_re, "", re.sub(URL_regex, "", text_test))

' Including my SAP technology business.  Thank you. Namaste.'

In [42]:
def clean_text(text, url_pattern=None, username_pattern=None):
    """Remove URLs and @mentions from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.
    url_pattern : str or compiled pattern, optional
        Regex for URLs. Defaults to the module-level ``URL_regex``.
    username_pattern : str or compiled pattern, optional
        Regex for @mentions. Defaults to the module-level ``twitter_username_re``.

    Returns
    -------
    str
        ``text`` with every URL match and every mention match deleted.
        Surrounding whitespace is left as it was.
    """
    if url_pattern is None:
        url_pattern = URL_regex
    if username_pattern is None:
        username_pattern = twitter_username_re

    without_urls = re.sub(url_pattern, "", text)
    # Continue from the URL-free text. The earlier version ran the second
    # substitution on the original `text`, so URLs were never removed.
    return re.sub(username_pattern, "", without_urls)

# Store the cleaned version of every tweet in a new column.
top6['clean_text'] = top6['tweet_text'].apply(clean_text)

In [44]:
# Make sure each tweet id appears once, then report (rows, columns).
top6 = top6.drop_duplicates(subset='tweet_id')
top6.shape

(859183, 14)

In [45]:
# Write the filtered, cleaned tweets to disk without the row index.
top6.to_csv(path_or_buf='filtered_data.csv', index=False)