In [1]:
import pandas as pd
import pymysql
from sqlalchemy import create_engine

### Creating the connection string to read the database.

In [2]:
# Build the SQLAlchemy engine for the SQLite file, addressed by a relative path.
sqlitedb_rel_path = '../data/raw_data_project_m1.db'
conn_str = 'sqlite:///' + sqlitedb_rel_path
engine = create_engine(conn_str)

### Showing the tables of the database.

In [3]:
# List the names of the tables recorded in SQLite's schema catalogue (sqlite_master).
df = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", engine)
# Bare last expression: without it the cell only assigns and nothing is shown.
df

### Cleaning the data of the personal_info table
- Age column: every value must share the same format: XX years old.
- Gender column: values must use the Female / Male format.
- Dem_has_children column: values must use the Yes / No format.
- Age_group column: some values fall outside the three bins.

In [4]:
# Load every row and column of the personal_info table into a DataFrame.
df_personal_info = pd.read_sql_query("SELECT * FROM personal_info", engine)

In [5]:
# Age: some rows hold a year instead of the "NN years old" text.
# Those rows can be isolated by filtering for values that do not contain "years old".
df_personal_info['age']

0       61 years old
1       57 years old
2       32 years old
3       45 years old
4       41 years old
            ...     
9644    37 years old
9645    53 years old
9646            1992
9647    47 years old
9648    51 years old
Name: age, Length: 9649, dtype: object

In [6]:
#Gender: Female, Male.
df_personal_info['gender'] = df_personal_info['gender'].astype("string").str.capitalize()
df_personal_info['gender'] = df_personal_info['gender'].replace('Fem', 'Female')

In [7]:
#Chilren: Yes or No.
df_personal_info['dem_has_children'] = df_personal_info['dem_has_children'].str.capitalize()

In [8]:
# Age group: list the distinct labels to check that they all share one format.
df_personal_info['age_group'].unique()

array(['40_65', '26_39', 'juvenile', '14_25'], dtype=object)

In [9]:
# Filter down to the rows labelled 'juvenile' to inspect them more easily.
# Their max and min 'age' values are birth years that fall in the 14-25 bin
# (2016 - 1991 = 25 and 2016 - 2002 = 14), so all of them are taken to belong
# to that same category.
juvenile_filter = df_personal_info['age_group'].str.contains("juvenile")
juvenile_rows = df_personal_info[juvenile_filter]
print(juvenile_rows.max())
print(juvenile_rows.min())

uuid                fee33680-da56-0133-c1d5-0a81e8b09a82
age                                                 2002
gender                                              Male
dem_has_children                                     Yes
age_group                                       juvenile
dtype: object
uuid                00199e00-d8c3-0133-c6b2-0a81e8b09a82
age                                                 1991
gender                                            Female
dem_has_children                                      No
age_group                                       juvenile
dtype: object


In [10]:
# Fold the 'juvenile' label into the '14_25' bin.
df_personal_info['age_group'] = df_personal_info['age_group'].replace('juvenile', '14_25')

In [17]:
# Cast 'age' to pandas' string dtype, then display the frame to check the result.
df_personal_info['age'] = df_personal_info['age'].astype("string")
df_personal_info

Unnamed: 0,uuid,age,gender,dem_has_children,age_group
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61 years old,Male,No,40_65
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57 years old,Male,Yes,40_65
2,83127080-da3d-0133-c74f-0a81e8b09a82,32 years old,Male,No,26_39
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45 years old,Male,Yes,40_65
4,24954a70-db98-0133-4a64-0a81e8b09a82,41 years old,Female,Yes,40_65
...,...,...,...,...,...
9644,7d1ac020-dcb4-0133-817a-0a81e8b09a82,37 years old,Female,No,26_39
9645,39f989f0-db52-0133-8482-0a81e8b09a82,53 years old,Male,Yes,40_65
9646,70ce4a90-d965-0133-f5e4-0a81e8b09a82,1992,Male,No,14_25
9647,2896e440-db3c-0133-5b67-0a81e8b09a82,47 years old,Male,Yes,40_65


In [18]:
# Remove the ' years old' suffix so only the leading number is left in each value.
df_personal_info['age'] = df_personal_info['age'].str.replace(' years old', '')

In [30]:
# Some rows store a four-digit birth year instead of an age. Convert those to
# an age relative to the survey year in one vectorised pass, instead of running
# a substring replace over the whole column once per candidate year.
SURVEY_YEAR = 2016          # reference year used to turn a birth year into an age
EARLIEST_BIRTH_YEAR = 1900  # values at or above this are read as years, not ages

age_numbers = df_personal_info['age'].astype('int64')
looks_like_year = age_numbers >= EARLIEST_BIRTH_YEAR
# Comparing whole numbers (rather than substrings) also catches birth years
# before 1980 and cannot rewrite digits that sit inside a longer value.
# The column is handed back as strings, the same dtype the loop used to leave.
df_personal_info['age'] = (
    age_numbers.mask(looks_like_year, SURVEY_YEAR - age_numbers).astype('string')
)

In [32]:
# Cast the cleaned 'age' values to 64-bit integers.
df_personal_info['age'] = df_personal_info['age'].astype('int64')

Unnamed: 0,uuid,age,gender,dem_has_children,age_group
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61,Male,No,40_65
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57,Male,Yes,40_65
2,83127080-da3d-0133-c74f-0a81e8b09a82,32,Male,No,26_39
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45,Male,Yes,40_65
4,24954a70-db98-0133-4a64-0a81e8b09a82,41,Female,Yes,40_65
...,...,...,...,...,...
9644,7d1ac020-dcb4-0133-817a-0a81e8b09a82,37,Female,No,26_39
9645,39f989f0-db52-0133-8482-0a81e8b09a82,53,Male,Yes,40_65
9646,70ce4a90-d965-0133-f5e4-0a81e8b09a82,24,Male,No,14_25
9647,2896e440-db3c-0133-5b67-0a81e8b09a82,47,Male,Yes,40_65


In [14]:
# Load country_info and lower-case the 'rural' labels in one expression.
# NOTE(review): the merged preview still shows 'countryside', 'country',
# 'city' and 'urban' in this column; these may need mapping onto a common
# vocabulary. Confirm the intended categories.
df_country_info = pd.read_sql_query("SELECT * FROM country_info", engine).assign(
    rural=lambda frame: frame['rural'].str.lower()
)

In [15]:
df_career_info = pd.read_sql_query("SELECT * FROM career_info", engine)
# Both the literal 'no' and missing values end up as 'no education'.
df_career_info['dem_education_level'] = (
    df_career_info['dem_education_level']
    .replace('no', 'no education')
    .fillna('no education')
)

In [16]:
# Load every row and column of the poll_info table; no cleaning is applied here.
# NOTE(review): the merged preview shows answers starting with '‰Û_', which
# looks like a text-encoding artefact in the stored data. Confirm and repair.
df_poll_info = pd.read_sql_query("SELECT * FROM poll_info", engine)

In [46]:
# Frames to combine, in merge order; the first entry is the left-most table.
dfs_list = [df_personal_info, df_country_info, df_career_info, df_poll_info]

In [47]:
from functools import reduce


def _merge_on_uuid(left, right):
    """Join two frames on their shared 'uuid' column (pandas' default join type)."""
    return pd.merge(left, right, on='uuid')


# Fold the list left to right so every table ends up joined onto the first one.
df_final = reduce(_merge_on_uuid, dfs_list)

In [48]:
# Preview the first rows of the merged frame.
df_final.head()

Unnamed: 0,uuid,age,gender,dem_has_children,age_group,country_code,rural,dem_education_level,dem_full_time_job,normalized_job_code,question_bbi_2016wave4_basicincome_awareness,question_bbi_2016wave4_basicincome_vote,question_bbi_2016wave4_basicincome_effect,question_bbi_2016wave4_basicincome_argumentsfor,question_bbi_2016wave4_basicincome_argumentsagainst
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61,Male,No,40_65,AT,countryside,no education,no,,I know something about it,I would not vote,None of the above,None of the above,None of the above
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57,Male,Yes,40_65,AT,urban,high,yes,861a9b9151e11362eb3c77ca914172d0,I understand it fully,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working
2,83127080-da3d-0133-c74f-0a81e8b09a82,32,Male,No,26_39,AT,city,no education,no,,I have heard just a little about it,I would not vote,‰Û_ gain additional skills,It creates more equality of opportunity,Foreigners might come to my country and take a...
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45,Male,Yes,40_65,AT,country,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c,I have heard just a little about it,I would probably vote for it,‰Û_ work less,It reduces anxiety about financing basic needs,None of the above
4,24954a70-db98-0133-4a64-0a81e8b09a82,41,Female,Yes,40_65,AT,city,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f,I have heard just a little about it,I would probably vote for it,None of the above,It reduces anxiety about financing basic needs,It is impossible to finance | It might encoura...
