# Data Cleaning : students / mentor seekers from Chile

In [1]:
#1) importing libraries for data cleaning, analysis and exploration

In [2]:
import pandas as pd 

In [3]:
#2) import dataset obtained from the survey in GoogleForm converted into .csv format

In [4]:
df1 = pd.read_csv('mentees_esp.csv')

In [5]:
#3) overview of data,n° of columns, rows and type of variables to evaluate what data types will be handled

In [6]:
df1.head(2)

Unnamed: 0,Timestamp,intelligence_type,mentor_attribute,mentor_rol,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender
0,2021/06/16 8:42:43 am EET,Inteligencia Existencial: ¿Te intriga el senti...,Calma,Ofrecer orientación;Entregar apoyo en la toma ...,Tecnología de la Información (TI),Conocimientos especializados en el área selecc...,Dos veces al mes (15 - 30 minutos de sesión),No es relevante,No es relevante,25,Hombre
1,2021/06/21 5:22:07 pm EET,Inteligencia Natural: ¿Te llama la atención la...,Empatía,Ofrecer orientación;Entregar apoyo en la toma ...,Arte,Habilidad para enseñar,Dos veces al mes (15 - 30 minutos de sesión),23-29,No es relevante,22,Hombre


In [7]:
#3.a) proof of data types and check for null values

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Timestamp          64 non-null     object
 1   intelligence_type  64 non-null     object
 2   mentor_attribute   64 non-null     object
 3   mentor_rol         64 non-null     object
 4   mentor_career      64 non-null     object
 5   mentor_skills      64 non-null     object
 6   session_freq       64 non-null     object
 7   mentor_age         64 non-null     object
 8   mentor_gender      64 non-null     object
 9   mentee_age         64 non-null     object
 10  mentee_gender      64 non-null     object
dtypes: object(11)
memory usage: 5.6+ KB


Now we check most repeated values, unique values, among others. For instance its possible to see from ".describe" that there are 41 intelligence types, which is certainly not correct as we worked with only 9, however the user inputs represent 41 probable combination of intelligence type. This give us insights about what data must be handled and how. 

In [9]:
df1.describe()

Unnamed: 0,Timestamp,intelligence_type,mentor_attribute,mentor_rol,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender
count,64,64,64,64,64,64,64,64,64,64,64
unique,64,41,5,15,10,5,4,4,3,10,2
top,2021/06/21 10:29:51 pm EET,Inteligencia Intrapersonal: ¿Reflexionas frecu...,Empatía,Ofrecer orientación;Tener disponibilidad para ...,Medicina,Habilidad para enseñar,Una vez por semana (sesión de 15 a 30 minutos),No es relevante,No es relevante,19,Hombre
freq,1,3,20,11,19,36,39,40,61,19,36


In [10]:
df1.isnull() # evaluate if there are null values in the data

Unnamed: 0,Timestamp,intelligence_type,mentor_attribute,mentor_rol,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
59,False,False,False,False,False,False,False,False,False,False,False
60,False,False,False,False,False,False,False,False,False,False,False
61,False,False,False,False,False,False,False,False,False,False,False
62,False,False,False,False,False,False,False,False,False,False,False


# it must be said that it does not make much sense to explore data which is still not translated and cleaned. It is important though, to know if there are any unexpected strings or inputs which are stored for instance asor unrecogniyed values. Thats why a preliminar exploration must be done but detailed analysis will be carried on  the final data set 

# 5 Data cleaning Intelligence Types - converting the values into a list and separate each input by columns ( users were allowed to give up to three inputs for intelligence types)

In [11]:
# convert whole dataframe to lower_case
df1 = df1.applymap(lambda s:s.lower() if type(s) == str else s)

In [12]:
# Separate intelligence types into three columns based on ";"
df2 = df1['intelligence_type'].str.split(';', expand=True) 
df2.head(2)

Unnamed: 0,0,1,2
0,inteligencia existencial: ¿te intriga el senti...,inteligencia interpersonal: ¿te gusta estar co...,inteligencia intrapersonal: ¿reflexionas frecu...
1,inteligencia natural: ¿te llama la atención la...,inteligencia musical: ¿te sientes atraído por ...,inteligencia intrapersonal: ¿reflexionas frecu...


 # general values are defined across all dataframes for replacing the whole text string by intelligence type to ensure uniform data across the 3 datasets - the chosen variables will be single words that represent the intelligence type, e.g. "natural", "musical" and so on.

In [13]:
df3 = df2.replace(regex={ r'^inteligencia natural: ¿te llama la atención la naturaleza y sientes que te relaja?.$': 'natural',
                         r'^inteligencia musical: ¿te sientes atraído por la música y por entenderla & crearla?.$': 'musical',
                         r'^inteligencia matemática: ¿te llama la atención la simetría de las cosas, unir diferentes elementos o los juegos de estrategia?.$': 'mathematical',
                         r'^inteligencia existencial: ¿te intriga el sentido de la vida o saber que más hay allá en el universo?.$': 'existencial',
                         r'^inteligencia interpersonal: ¿te gusta estar con gente e interactuar con diferentes personas?.$': 'interpersonal',
                         r'^inteligencia corporal-kinestésica: ¿amas los deportes o crear cosas con tus manos?.$': 'corporal',
                         r'^inteligencia lingüística: ¿te gustan los idiomas, los juegos de palabras o contar historias?.$': 'linguistic',
                         r'^inteligencia visual-espacial: ¿te interesan laberintos, rompecabezas o puedes visualizar el resultado de algo antes de hacerlo?.$': 'visual_spacial',
                         r'^inteligencia intrapersonal: ¿reflexionas frecuentemente sobre tus emociones? ¿te gusta planear cuidadosamente tus metas?.$': 'intrapersonal',
                        })

In [14]:
#renaming columns defining "itype" as equivalent tointelligence type
df3.rename(columns={0: "itype1", 1: "itype2", 2: 'itype3'}, inplace=True)

In [15]:
df3.fillna(0) # search for empty values and fill replace these with 0

Unnamed: 0,itype1,itype2,itype3
0,existencial,interpersonal,inteligencia intrapersonal: ¿reflexionas frecu...
1,natural,musical,inteligencia intrapersonal: ¿reflexionas frecu...
2,musical,existencial,interpersonal
3,musical,mathematical,existencial
4,existencial,0,0
...,...,...,...
59,natural,mathematical,inteligencia intrapersonal: ¿reflexionas frecu...
60,musical,mathematical,corporal
61,musical,mathematical,corporal
62,mathematical,existencial,visual_spacial


# somehow working with REGEX was fine for all of the data types from intelligence types but it did not work for intrapersonal. 
# Therefore this was a way around  using "string contains" to force the change and have cleaned data.

#locate intrapersonal and change all values where Intelligence intrapersonal is true to only intrapersonal

In [16]:
df3.loc[df3['itype1'].str.contains('intrapersonal') ==True, 'itype1'] = 'intrapersonal'
df3.loc[df3['itype2'].str.contains('intrapersonal') ==True, 'itype2'] = 'intrapersonal'
df3.loc[df3['itype3'].str.contains('intrapersonal') ==True, 'itype3'] = 'intrapersonal'

# this is the new dataframe for intelligence types which will be concatenated to the main one

In [17]:
df3.head()

Unnamed: 0,itype1,itype2,itype3
0,existencial,interpersonal,intrapersonal
1,natural,musical,intrapersonal
2,musical,existencial,interpersonal
3,musical,mathematical,existencial
4,existencial,,


In [18]:
#concatanate both dataframes

In [19]:
df4 = pd.concat([df1, df3], axis=1) 

In [20]:
# now we can drop columns that are not needed (timestamp, and also intelligence_type as now we have 3 columns itype1,2,3)
df4.drop(['Timestamp', 'intelligence_type'], axis=1, inplace=True)

In [21]:
df4.head(2)

Unnamed: 0,mentor_attribute,mentor_rol,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3
0,calma,ofrecer orientación;entregar apoyo en la toma ...,tecnología de la información (ti),conocimientos especializados en el área selecc...,dos veces al mes (15 - 30 minutos de sesión),no es relevante,no es relevante,25,hombre,existencial,interpersonal,intrapersonal
1,empatía,ofrecer orientación;entregar apoyo en la toma ...,arte,habilidad para enseñar,dos veces al mes (15 - 30 minutos de sesión),23-29,no es relevante,22,hombre,natural,musical,intrapersonal


In [22]:
# same data cleaning process as for intelligence types will be done  for the column "mentor_rol" as users were able to give up to two priorities 


In [23]:
df5 = df4['mentor_rol'].str.split(';', expand=True)
df5.head(10)

Unnamed: 0,0,1
0,ofrecer orientación,entregar apoyo en la toma de decisiones
1,ofrecer orientación,entregar apoyo en la toma de decisiones
2,ofrecer orientación,motivar
3,ayudar a resolver problemas,motivar
4,ofrecer orientación,ayudar a resolver problemas
5,ofrecer orientación,ayudar a resolver problemas
6,motivar,
7,entregar apoyo en la toma de decisiones,tener disponibilidad para contestar preguntas
8,ofrecer orientación,ayudar a resolver problemas
9,entregar apoyo en la toma de decisiones,ayudar a resolver problemas


# here the name itype was considered indiscriminately, despite the fact it is not an intelligence type variable. 
# this will be however modified afterwards

In [24]:
df5.rename(columns={0: "itype4", 1: "itype5"}, inplace=True) 
df5.fillna(0) # check for null values

Unnamed: 0,itype4,itype5
0,ofrecer orientación,entregar apoyo en la toma de decisiones
1,ofrecer orientación,entregar apoyo en la toma de decisiones
2,ofrecer orientación,motivar
3,ayudar a resolver problemas,motivar
4,ofrecer orientación,ayudar a resolver problemas
...,...,...
59,ofrecer orientación,0
60,entregar apoyo en la toma de decisiones,tener disponibilidad para contestar preguntas
61,entregar apoyo en la toma de decisiones,tener disponibilidad para contestar preguntas
62,ofrecer orientación,tener disponibilidad para contestar preguntas


# Now we handle the data of mentor_role with string contains as a more efficient way of handling data and changing the names into an uniform variable name by each column by that will be assigned to all the variables in english from the 4 datasets

In [25]:

df5.loc[df5['itype4'].str.contains('resolver') ==True, 'itype4'] = 'solve_problems'
df5.loc[df5['itype5'].str.contains('resolver') ==True, 'itype5'] = 'solve_problems'
df5.loc[df5['itype4'].str.contains('apoyo') ==True, 'itype4'] = 'support'
df5.loc[df5['itype5'].str.contains('apoyo') ==True, 'itype5'] = 'support'
df5.loc[df5['itype4'].str.contains('disponibilidad') ==True, 'itype4'] = 'availability'
df5.loc[df5['itype5'].str.contains('disponibilidad') ==True, 'itype5'] = 'availability'
df5.loc[df5['itype4'].str.contains('frecer') ==True, 'itype4'] = 'guidance'
df5.loc[df5['itype5'].str.contains('frecer') ==True, 'itype5'] = 'guidance'
df5.loc[df5['itype4'].str.contains('tivar') ==True, 'itype4'] = 'motivate'
df5.loc[df5['itype5'].str.contains('tivar') ==True, 'itype5'] = 'motivate'
df5.loc[df5['itype5'].str.contains('explicar') ==True, 'itype5'] = 'solve_problems'

In [26]:
# this is how the dataset looks like now, and this will be concatenated to the other dataframe
df5.head(10) 

Unnamed: 0,itype4,itype5
0,guidance,support
1,guidance,support
2,guidance,motivate
3,solve_problems,motivate
4,guidance,solve_problems
5,guidance,solve_problems
6,motivate,
7,support,availability
8,guidance,solve_problems
9,support,solve_problems


# here the data is already prepared in terms of list separations, and now it is possible to translate the rest of data into english

In [27]:
# concatenating mentor_role dataset to general data_set having a final one with variables in english and spanish.
df_final = pd.concat([df4, df5], axis=1) 

In [28]:
df_final.head(2)

Unnamed: 0,mentor_attribute,mentor_rol,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,itype4,itype5
0,calma,ofrecer orientación;entregar apoyo en la toma ...,tecnología de la información (ti),conocimientos especializados en el área selecc...,dos veces al mes (15 - 30 minutos de sesión),no es relevante,no es relevante,25,hombre,existencial,interpersonal,intrapersonal,guidance,support
1,empatía,ofrecer orientación;entregar apoyo en la toma ...,arte,habilidad para enseñar,dos veces al mes (15 - 30 minutos de sesión),23-29,no es relevante,22,hombre,natural,musical,intrapersonal,guidance,support


In [29]:
# drop mentor_rol as it was converted into new columns (itype4 and itype5)
df_final.drop(['mentor_rol'], axis=1, inplace=True)

In [30]:
#quick overview of  the dataset that will be translated
df_final.head(5)

Unnamed: 0,mentor_attribute,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,itype4,itype5
0,calma,tecnología de la información (ti),conocimientos especializados en el área selecc...,dos veces al mes (15 - 30 minutos de sesión),no es relevante,no es relevante,25,hombre,existencial,interpersonal,intrapersonal,guidance,support
1,empatía,arte,habilidad para enseñar,dos veces al mes (15 - 30 minutos de sesión),23-29,no es relevante,22,hombre,natural,musical,intrapersonal,guidance,support
2,calma,diseño,habilidades comunicacionales,dos veces al mes (15 - 30 minutos de sesión),30-36,no es relevante,26,hombre,musical,existencial,interpersonal,guidance,motivate
3,calma,tecnología de la información (ti),habilidad para enseñar,una vez al mes (sesión de 30 a 60 minutos),no es relevante,no es relevante,18,hombre,musical,mathematical,existencial,solve_problems,motivate
4,entusiasmo,tecnología de la información (ti),habilidad para enseñar,una vez por semana (sesión de 15 a 30 minutos),23-29,no es relevante,18,hombre,existencial,,,guidance,solve_problems


In [31]:
#Translating values into english for column mentor_attribute
df_final.loc[df_final['mentor_attribute'].str.contains('alma'), 'mentor_attribute'] = 'calmness'
df_final.loc[df_final['mentor_attribute'].str.contains('empat'), 'mentor_attribute'] = 'empathy'
df_final.loc[df_final['mentor_attribute'].str.contains('tusiasmo'), 'mentor_attribute'] = 'enthusiasm'
df_final.loc[df_final['mentor_attribute'].str.contains('umildad'), 'mentor_attribute'] = 'modesty'
df_final.loc[df_final['mentor_attribute'].str.contains('culosidad'), 'mentor_attribute'] = 'meticulousness'

In [32]:
#Translating values into english for column mentor_career
df_final.loc[df_final['mentor_career'].str.contains('iseño'), 'mentor_career'] = 'design'
df_final.loc[df_final['mentor_career'].str.contains('ti'), 'mentor_career'] = 'it'
df_final.loc[df_final['mentor_career'].str.contains('rte'), 'mentor_career'] = 'art'
df_final.loc[df_final['mentor_career'].str.contains('nabilidad'), 'mentor_career'] = 'sustainability'
df_final.loc[df_final['mentor_career'].str.contains('dicina'), 'mentor_career'] = 'medicine'
df_final.loc[df_final['mentor_career'].str.contains('dimiento'), 'mentor_career'] = 'entrepeneurship'
df_final.loc[df_final['mentor_career'].str.contains('sicolo'), 'mentor_career'] = 'psychology/rrhh'
df_final.loc[df_final['mentor_career'].str.contains('keting'), 'mentor_career'] = 'marketing'
df_final.loc[df_final['mentor_career'].str.contains('mpras'), 'mentor_career'] = 'purchase/sales'
df_final.loc[df_final['mentor_career'].str.contains('yes'), 'mentor_career'] = 'law'
df_final.loc[df_final['mentor_career'].str.contains('vestiga'), 'mentor_career'] = 'research'
df_final.loc[df_final['mentor_career'].str.contains('esarrollo'), 'mentor_career'] = 'web development'



In [33]:
#Translating values into english for mentor skills
df_final.loc[df_final['mentor_skills'].str.contains('onocimientos'), 'mentor_skills'] = 'career know-how'
df_final.loc[df_final['mentor_skills'].str.contains('écnica'), 'mentor_skills'] = 'it skills'
df_final.loc[df_final['mentor_skills'].str.contains('escritura'), 'mentor_skills'] = 'writing skills'
df_final.loc[df_final['mentor_skills'].str.contains('lectura'), 'mentor_skills'] = 'reading skills'
df_final.loc[df_final['mentor_skills'].str.contains('nicacional'), 'mentor_skills'] = 'communication skills'
df_final.loc[df_final['mentor_skills'].str.contains('enseñar'), 'mentor_skills'] = 'teaching skills'


In [34]:
#Translating values into english for session _ freq
df_final.loc[df_final['session_freq'].str.contains('os veces al mes'), 'session_freq'] = 'twice per month ( 15 - 30 minute session)'
df_final.loc[df_final['session_freq'].str.contains('na vez al mes'), 'session_freq'] = 'once per month ( 30 - 60 minute session)'
df_final.loc[df_final['session_freq'].str.contains('lugar de se'), 'session_freq'] = 'only weekly email communication)'
df_final.loc[df_final['session_freq'].str.contains('semana'), 'session_freq'] = 'once per week ( 15 - 30 minute session)'

In [35]:
#Translating values into english for mentor age
df_final.loc[df_final['mentor_age'].str.contains('relevante'), 'mentor_age'] = 'not relevant'
#Translating values into english for mentee_gender
df_final.loc[df_final['mentee_gender'].str.contains('hombre'), 'mentee_gender'] = 'male'
df_final.loc[df_final['mentee_gender'].str.contains('mujer'), 'mentee_gender'] = 'female'
df_final.loc[df_final['mentee_gender'].str.contains('hombre transgénero'), 'mentee_gender'] = 'transgender male'
df_final.loc[df_final['mentee_gender'].str.contains('mujer transgénero'), 'mentee_gender'] = 'transgender female'
#Translating values into english formentor_gender
df_final.loc[df_final['mentor_gender'].str.contains('hombre'), 'mentor_gender'] = 'Male'
df_final.loc[df_final['mentor_gender'].str.contains('mujer'), 'mentor_gender'] = 'Female'
df_final.loc[df_final['mentor_gender'].str.contains('hombre transgénero'), 'mentor_gender'] = 'transgender male'
df_final.loc[df_final['mentor_gender'].str.contains('mujer transgénero'), 'mentor_gender'] = 'transgender female'
df_final.loc[df_final['mentor_gender'].str.contains('relevante'), 'mentor_gender'] = 'not relevant'

In [36]:
#rename columns from the originally coliumn "mentor_role"
df_final.rename(columns={"itype4": "mentor_role_1", "itype5": "mentor_role_2"}, inplace=True)
df_final.head(2)

Unnamed: 0,mentor_attribute,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,mentor_role_1,mentor_role_2
0,calmness,it,career know-how,twice per month ( 15 - 30 minute session),not relevant,not relevant,25,male,existencial,interpersonal,intrapersonal,guidance,support
1,empathy,art,teaching skills,twice per month ( 15 - 30 minute session),23-29,not relevant,22,male,natural,musical,intrapersonal,guidance,support


In [37]:
df_final.head()

Unnamed: 0,mentor_attribute,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,mentor_role_1,mentor_role_2
0,calmness,it,career know-how,twice per month ( 15 - 30 minute session),not relevant,not relevant,25,male,existencial,interpersonal,intrapersonal,guidance,support
1,empathy,art,teaching skills,twice per month ( 15 - 30 minute session),23-29,not relevant,22,male,natural,musical,intrapersonal,guidance,support
2,calmness,design,communication skills,twice per month ( 15 - 30 minute session),30-36,not relevant,26,male,musical,existencial,interpersonal,guidance,motivate
3,calmness,it,teaching skills,once per month ( 30 - 60 minute session),not relevant,not relevant,18,male,musical,mathematical,existencial,solve_problems,motivate
4,enthusiasm,it,teaching skills,once per week ( 15 - 30 minute session),23-29,not relevant,18,male,existencial,,,guidance,solve_problems


In [38]:
# QUICK ROUND OF CHECKING ALL VALUES BEFORE EXPORTING DATASET TO BE JOINED TO ADDITIONAL 2 others

In [39]:
df_final['itype1'].value_counts()

mathematical      16
natural           16
musical           14
existencial        9
intrapersonal      3
interpersonal      2
linguistic         2
corporal           1
visual_spacial     1
Name: itype1, dtype: int64

In [40]:
df_final['itype2'].value_counts()

mathematical      12
corporal           9
existencial        8
musical            8
interpersonal      6
linguistic         5
intrapersonal      2
visual_spacial     1
Name: itype2, dtype: int64

In [41]:
df_final['itype3'].value_counts()

intrapersonal     16
visual_spacial    12
corporal           6
existencial        4
interpersonal      4
linguistic         2
Name: itype3, dtype: int64

In [42]:
df_final['mentor_attribute'].value_counts()

empathy           20
meticulousness    14
enthusiasm        14
calmness          11
modesty            5
Name: mentor_attribute, dtype: int64

In [43]:
df_final.head(2)

Unnamed: 0,mentor_attribute,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,mentor_role_1,mentor_role_2
0,calmness,it,career know-how,twice per month ( 15 - 30 minute session),not relevant,not relevant,25,male,existencial,interpersonal,intrapersonal,guidance,support
1,empathy,art,teaching skills,twice per month ( 15 - 30 minute session),23-29,not relevant,22,male,natural,musical,intrapersonal,guidance,support


In [44]:
df_final['mentor_career'].value_counts()

it                 19
medicine           19
web development     8
entrepeneurship     6
psychology/rrhh     6
purchase/sales      2
art                 2
design              1
big data            1
Name: mentor_career, dtype: int64

In [45]:
df_final['mentor_skills'].value_counts()

teaching skills         36
career know-how         17
communication skills     8
it skills                2
reading skills           1
Name: mentor_skills, dtype: int64

In [46]:
df_final['session_freq'].value_counts()

once per week ( 15 - 30 minute session)      39
twice per month ( 15 - 30 minute session)    13
once per month ( 30 - 60 minute session)      6
only weekly email communication)              6
Name: session_freq, dtype: int64

In [47]:
df_final['mentor_age'].value_counts()

not relevant    40
23-29           14
30-36            8
37-42            2
Name: mentor_age, dtype: int64

In [48]:
df_final['mentor_gender'].value_counts()

not relevant    61
Male             2
Female           1
Name: mentor_gender, dtype: int64

In [49]:
df_final['mentee_age'].value_counts()

19      19
18      17
21       9
22       5
20       4
23       3
24       3
"18"     2
25       1
26       1
Name: mentee_age, dtype: int64

In [50]:
# as seen above, some users made the input with "" for age, therefore this must be changed to ensure data uniformity
df_final.loc[df_final['mentee_age'].str.contains('"18"'), 'mentee_age'] = '18'

In [51]:
df_final['mentee_age'].value_counts()

18    19
19    19
21     9
22     5
20     4
23     3
24     3
25     1
26     1
Name: mentee_age, dtype: int64

In [52]:
df_final['mentor_role_1'].value_counts()

guidance          34
support           17
availability       7
solve_problems     4
motivate           2
Name: mentor_role_1, dtype: int64

In [53]:
df_final['mentor_role_2'].value_counts()

availability      17
motivate          17
solve_problems    14
support            6
Name: mentor_role_2, dtype: int64

In [54]:
df_final.head()

Unnamed: 0,mentor_attribute,mentor_career,mentor_skills,session_freq,mentor_age,mentor_gender,mentee_age,mentee_gender,itype1,itype2,itype3,mentor_role_1,mentor_role_2
0,calmness,it,career know-how,twice per month ( 15 - 30 minute session),not relevant,not relevant,25,male,existencial,interpersonal,intrapersonal,guidance,support
1,empathy,art,teaching skills,twice per month ( 15 - 30 minute session),23-29,not relevant,22,male,natural,musical,intrapersonal,guidance,support
2,calmness,design,communication skills,twice per month ( 15 - 30 minute session),30-36,not relevant,26,male,musical,existencial,interpersonal,guidance,motivate
3,calmness,it,teaching skills,once per month ( 30 - 60 minute session),not relevant,not relevant,18,male,musical,mathematical,existencial,solve_problems,motivate
4,enthusiasm,it,teaching skills,once per week ( 15 - 30 minute session),23-29,not relevant,18,male,existencial,,,guidance,solve_problems


In [55]:
# this is the first dataset obtained from the 3 surveys and prepared for being merged with its mentor chilean pair.
df_final.to_csv('mentees_chile_cleaned.csv')