# 1- Load the required modules

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import sklearn.model_selection as cv
import sklearn
# You must install the libraries colour and colorclassifier:
# Run the command "pip install colour colorclassifier colormath==1.0.8"
from colour import Color
from colorclassifier import Classifier
from datetime import datetime

# 2- Load csv file with preprocessed location

In [2]:
ENCODING = 'latin1'

# Read the CSV file with preprocessed locations.
# The original locations were imputed to country locations.
data = pd.read_csv(
    "tweet_location_preprocessing/gender-classifier-tweet-location-preprocessed-0-18933.csv",
    encoding=ENCODING,
)

# Rows with gender == 'unknown' are expected to have been removed already.
# Check the gender column directly instead of summing every column of the
# filtered frame, which depends on unrelated columns and can raise a
# TypeError on mixed string/number data instead of failing the assertion.
assert not (data['gender'] == 'unknown').any(), "rows with gender == 'unknown' are still present"

data.head()

Unnamed: 0.1,Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,Unknown,Chennai
1,1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,Unknown,Eastern Time (US & Canada)
2,2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,India,Belgrade
3,3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,United States,Pacific Time (US & Canada)
4,4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,Unknown,


# 3- Select attrs that will be used

In [3]:
# Remove the columns that are not used in the rest of the notebook
data = data.drop(
    labels=[
        'Unnamed: 0',
        '_unit_id',
        '_golden',
        '_unit_state',
        '_trusted_judgments',
        '_last_judgment_at',
        'gender:confidence',
        'profile_yn',
        'profile_yn:confidence',
        'gender_gold',
        'profile_yn_gold',
        'profileimage',
        'tweet_id',
    ],
    axis=1,
)
data.head()

Unnamed: 0,gender,created,description,fav_number,link_color,name,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_location,user_timezone
0,male,12/5/13 1:48,i sing my own rhythm.,0,08C2C2,sheezy0,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,Unknown,Chennai
1,male,10/1/12 13:51,I'm the author of novels filled with family dr...,68,0084B4,DavdBurnett,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,Unknown,Eastern Time (US & Canada)
2,male,11/28/14 11:30,louis whining and squealing and all,7696,ABB8C2,lwtprettylaugh,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,India,Belgrade
3,male,6/11/09 22:39,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",202,0084B4,douggarland,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,United States,Pacific Time (US & Canada)
4,female,4/16/14 13:23,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,37318,3B94D9,WilfordGemma,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,Unknown,


# 4- Preprocess fields with string values and transform coordinates to binary

In [4]:
# Replace the free-text columns by their length in characters.
# str() is applied to every value first, so a missing value (NaN) is counted
# as len('nan') == 3, exactly as the previous row-by-row loop did.
# DataFrame.set_value is no longer available in current pandas, so the
# conversion is done column-wise instead.
for column in ('description', 'text', 'name'):
    data[column] = data[column].map(lambda value: len(str(value)))

# tweet_coord becomes a presence flag stored as the strings "true" / "false"
data['tweet_coord'] = data['tweet_coord'].isnull().map({True: 'false', False: 'true'})

data = data.rename(columns={'description': 'description_length', 'text': 'text_length', 'name': 'name_length'})
data.head()

Unnamed: 0,gender,created,description_length,fav_number,link_color,name_length,retweet_count,sidebar_color,text_length,tweet_coord,tweet_count,tweet_created,tweet_location,user_timezone
0,male,12/5/13 1:48,21,0,08C2C2,7,0,FFFFFF,109,False,110964,10/26/15 12:40,Unknown,Chennai
1,male,10/1/12 13:51,62,68,0084B4,11,0,C0DEED,139,False,7471,10/26/15 12:40,Unknown,Eastern Time (US & Canada)
2,male,11/28/14 11:30,35,7696,ABB8C2,14,1,C0DEED,80,False,5617,10/26/15 12:40,India,Belgrade
3,male,6/11/09 22:39,146,202,0084B4,11,0,C0DEED,138,False,1693,10/26/15 12:40,United States,Pacific Time (US & Canada)
4,female,4/16/14 13:23,160,37318,3B94D9,12,0,0,95,False,31462,10/26/15 12:40,Unknown,


# 5- Change color values to color categories

In [5]:
def hexToColor(hex):
    """Return the name of the Classifier color closest to a hex color string.

    hex: hex digits without the leading '#'. A string longer than six
         characters is replaced by '000000'; a shorter one is right-padded
         with '0' up to six characters.
    """
    digits = len(hex)
    if digits > 6:
        hex = '000000'
    elif digits < 6:
        hex = hex.ljust(6, '0')
    # Color(...).rgb holds fractional channels; scale them to 0..255 integers
    channels = [int(round(channel * 255)) for channel in Color('#' + hex).rgb]
    return Classifier(rgb=channels).get_name()

data['sidebar_color'] = data['sidebar_color'].apply(hexToColor)
# There are 14 distinct colors, not counting values of the form "1.10E+17",
# for which "0" was used instead.
# They are the colors of the Classifier library; the most similar one is chosen.
print("Sidebar colors:")
print(data.sidebar_color.unique())

data['link_color'] = data['link_color'].apply(hexToColor)
print("Link colors:")
print(data.link_color.unique())

# Missing time zones become the explicit category 'Unknown'.
# fillna replaces the per-element lambda, which was bound to the name `time`
# and so shadowed the standard-library module of the same name.
data['user_timezone'] = data['user_timezone'].fillna('Unknown')
data.head()

Sidebar colors:
['white' 'black' 'lightblue' 'orange' 'lightgreen' 'gray' 'lightred' 'cyan'
 'red' 'brown' 'blue' 'yellow' 'violet' 'green']
Link colors:
['cyan' 'gray' 'lightblue' 'lightred' 'blue' 'black' 'violet' 'red'
 'orange' 'brown' 'green' 'lightgreen' 'white' 'yellow']


Unnamed: 0,gender,created,description_length,fav_number,link_color,name_length,retweet_count,sidebar_color,text_length,tweet_coord,tweet_count,tweet_created,tweet_location,user_timezone
0,male,12/5/13 1:48,21,0,cyan,7,0,white,109,False,110964,10/26/15 12:40,Unknown,Chennai
1,male,10/1/12 13:51,62,68,gray,11,0,white,139,False,7471,10/26/15 12:40,Unknown,Eastern Time (US & Canada)
2,male,11/28/14 11:30,35,7696,gray,14,1,white,80,False,5617,10/26/15 12:40,India,Belgrade
3,male,6/11/09 22:39,146,202,gray,11,0,white,138,False,1693,10/26/15 12:40,United States,Pacific Time (US & Canada)
4,female,4/16/14 13:23,160,37318,lightblue,12,0,black,95,False,31462,10/26/15 12:40,Unknown,Unknown


# 6- Change create date values to categories

In [6]:
# Tweets are only created at midday (between 12h and 13h), probably the time when the tweets were collected
def whichTimeRange(hour):
    """Return the time-of-day category for an hour of the day.

    hour: hour in the interval [0, 24); checked with an assert.

    Returns 'early_morning' (1-5), 'morning' (6-11), 'midday' (12-13),
    'afternoon' (14-20) or 'evening' (every other hour, i.e. 21-23 and 0).
    """
    assert 0 <= hour < 24
    buckets = (
        (range(1, 6), 'early_morning'),
        (range(6, 12), 'morning'),
        (range(12, 14), 'midday'),
        (range(14, 21), 'afternoon'),
    )
    for span, label in buckets:
        if hour in span:
            return label
    # Anything not matched above: 21 .. 0
    return 'evening'

def dateToRange(date):
    """Return the time-of-day category of a timestamp string.

    date: string in the format '%m/%d/%y %H:%M' (for example '10/26/15 12:40').
    Only the hour is used; it is passed on to whichTimeRange.
    """
    parsed = datetime.strptime(date, '%m/%d/%y %H:%M')
    return whichTimeRange(parsed.hour)

# Replace both timestamp columns by their time-of-day category
for column in ('created', 'tweet_created'):
    data[column] = data[column].apply(dateToRange)

In [7]:
# Preview the first rows of the frame
data.head()

Unnamed: 0,gender,created,description_length,fav_number,link_color,name_length,retweet_count,sidebar_color,text_length,tweet_coord,tweet_count,tweet_created,tweet_location,user_timezone
0,male,early_morning,21,0,cyan,7,0,white,109,False,110964,midday,Unknown,Chennai
1,male,midday,62,68,gray,11,0,white,139,False,7471,midday,Unknown,Eastern Time (US & Canada)
2,male,morning,35,7696,gray,14,1,white,80,False,5617,midday,India,Belgrade
3,male,evening,146,202,gray,11,0,white,138,False,1693,midday,United States,Pacific Time (US & Canada)
4,female,midday,160,37318,lightblue,12,0,black,95,False,31462,midday,Unknown,Unknown


# 7- Export CSV

In [8]:
# Write the preprocessed frame to disk without the index, using the same encoding it was read with
data.to_csv(path_or_buf='datapreprocessed.csv', index=False, encoding=ENCODING)