** Exercice: **

Take countries_GDP.csv and convert it to a valid csv file without header.

In [64]:
import pandas as pd
import numpy as np

#read csv as data frame
df_gdp_raw = pd.read_csv("../data/countries_GDP.csv")
#select columns and use these that have data in 'Unamed:0', which
#actually is the country code
df_gdp = df_gdp_raw[[0,1,3,4]][df_gdp_raw['Unnamed: 0'].notnull()]
#rename columns and index
df_gdp.columns=["country_code","position","country_name","gdp"]
df_gdp.index = range(df_gdp.shape[0])
#show head
print df_gdp.head()
#show types, take into account that gdp should be integer
print df_gdp.dtypes
#change gdp dtype to numeric
df_gdp.gdp = df_gdp.gdp.apply(lambda x: x.replace(",","").strip(" "))
df_gdp.gdp = pd.to_numeric(df_gdp.gdp,errors="coerce")
print df_gdp.gdp.dtype
print df_gdp.head()
print df_gdp.tail()
#save as csv, set header as false
df_gdp.to_csv("../data/countries_GDP_clean.csv",header=False)

  country_code position    country_name           gdp
0          USA        1   United States   17,946,996 
1          CHN        2           China   10,866,444 
2          JPN        3           Japan    4,123,258 
3          DEU        4         Germany    3,355,772 
4          GBR        5  United Kingdom    2,848,755 
country_code    object
position        object
country_name    object
gdp             object
dtype: object
float64
  country_code position    country_name         gdp
0          USA        1   United States  17946996.0
1          CHN        2           China  10866444.0
2          JPN        3           Japan   4123258.0
3          DEU        4         Germany   3355772.0
4          GBR        5  United Kingdom   2848755.0
    country_code position         country_name         gdp
224          SSF      NaN   Sub-Saharan Africa   1572873.0
225          LIC      NaN           Low income    392904.0
226          LMC      NaN  Lower middle income   5820363.0
227          U

**Exercice:**

Clean countries data and save it as a valid csv without header.

In [68]:
import pandas as pd
import numpy as np

df_country_raw = pd.read_csv("../data/countries_data.csv",sep=";")
df_country_raw.head(15)
df_country_raw.to_csv("../data/countries_data_clean.csv",header=False)

**Exercice:**

Build a function that generates a dataframe with N user id plus a list of a random number of random news topics from news_topics.csv

In [299]:
import pandas as pd
import numpy as np


def generate_users_df(num_users, num_topics):
    #generate num_users usernames
    usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))

    #read topics csv
    news_topics = pd.read_csv("../data/news_topics.csv",header=None)
    #generate a list of N int picked uniformly random from range 0 .. num_topics
    #WARNING: is really an uniform distribution??
    rand_ints = pd.Series(np.random.randint(1,num_topics+1,num_users))

    #WARNING: what happens if x>len(news_topics)
    topics_df = rand_ints.apply(lambda x: "|".join(np.random.choice(news_topics.T[0],x,replace=False)))

    return pd.concat({'username':usernames_df,'topics':topics_df},axis=1)
    
M = 5
N = 100
users_df = generate_users_df(N,M)
users_df.head(10)

Unnamed: 0,topics,username
0,world|healthcare|comedians|directors,user0
1,economists|studios|video games,user1
2,automotive|food|movies,user2
3,reporters,user3
4,us|video games|environment|banks|basketball,user4
5,books|ncaa|theatre,user5
6,grammys|horses|sportscasters|immigration|children,user6
7,elections|banks|immigration,user7
8,energy|artists,user8
9,figure skating|energy,user9


** Exercice: ** 

Save the info generated with the previous function as csv so that it can be easily loaded as a Pair RDD in pyspark.

In [300]:
import csv 

M = 20
N = 1000
users_df = generate_users_df(N,M)

users_df.to_csv("../data/users_events_example/user_info_%susers_%stopics.csv" % (N,M),
                columns=["username","topics"],
                header=None, 
                index=None)
                #quoting=csv.QUOTE_MINIMAL)

** Exercice: ** 

Build a function that generates N csv files containing user's web browsing information. This function takes a max number of users M (from user0 to userM) and generates K user information logs for a randomly picked user (with repetition). The function will return this information with a timestamp. Each file represents 5 minute activity, the activity period will be K/300. The activity information is a random selection of 1 element over news topics.

In [301]:
import datetime

def generate_user_events(date_start, num_files, num_users, num_events):
    #generate usernames
    usernames_df = pd.Series(["user"]*num_users).str.cat(pd.Series(np.arange(num_users)).map(str))
    #read topics
    news_topics = pd.read_csv("../data/news_topics.csv",header=None,lineterminator="\n").T
    #create time index
    df_index = pd.date_range(date_start, 
                             periods=num_events, 
                             freq=pd.DateOffset(seconds=float(5*60)/num_events))
    #generate data
    event_data = {"user" : np.random.choice(usernames_df,num_events,replace=True),
                  "event" : np.random.choice(news_topics[0],num_events,replace=True)}
    #generate df
    return pd.DataFrame(event_data, index = df_index, columns=["user", "event"])

num_files = 10
num_users = 100
num_events = 1000
date_start = datetime.datetime.strptime('1/1/2016', '%d/%m/%Y')

for idx,i in enumerate(range(num_files)):
    print "File ",idx+1," of ", num_files, " at ",date_start
    userevent_df = generate_user_events(date_start, num_files, num_users, num_events)
    file_name = "../data/users_events_example/userevents_" + date_start.strftime("%d%m%Y%H%M%S") + ".log"
    userevent_df.to_csv(file_name, header=None)
    date_start = date_start + datetime.timedelta(0,300)

File  1  of  10  at  2016-01-01 00:00:00
File  2  of  10  at  2016-01-01 00:05:00
File  3  of  10  at  2016-01-01 00:10:00
File  4  of  10  at  2016-01-01 00:15:00
File  5  of  10  at  2016-01-01 00:20:00
File  6  of  10  at  2016-01-01 00:25:00
File  7  of  10  at  2016-01-01 00:30:00
File  8  of  10  at  2016-01-01 00:35:00
File  9  of  10  at  2016-01-01 00:40:00
File  10  of  10  at  2016-01-01 00:45:00


In [297]:
fi = file("../data/users_events_example/user_info_1000users_20topics.csv","r")
filc = fi.readlines()
for l in filc:
    print l
    print (l.split(",")[0],l.split(",")[1].split("|"))

user0,basketball|banks|books|world|us|real estate|automotive|comics|us|companies|disasters|hockey|soccer|horses|style|energy|soccer

('user0', ['basketball', 'banks', 'books', 'world', 'us', 'real estate', 'automotive', 'comics', 'us', 'companies', 'disasters', 'hockey', 'soccer', 'horses', 'style\r', 'energy', 'soccer\n'])
user1,companies

('user1', ['companies\n'])
user2,environment|music|banks|oscars|grammys|people|people|stock exchange|studios|CEOs|finance|boxing|grammys|producers|boxing|hockey|basketball|figure skating|taxes|sportscasters

('user2', ['environment', 'music', 'banks', 'oscars', 'grammys', 'people', 'people', 'stock exchange', 'studios', 'CEOs', 'finance', 'boxing', 'grammys', 'producers', 'boxing', 'hockey', 'basketball', 'figure skating', 'taxes', 'sportscasters\n'])
user3,theatre|olympics|tennis|world|energy|stock exchange|actors|golf|taxes|tv|style|banks|grammys|weather|CEOs|figure skating|baseball|food|technology|video games

('user3', ['theatre', 'olympics', 