In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read each metadata CSV split into its own DataFrame. Paths are relative to
# the working directory, and the files are read in the order listed.
(
    invalid,
    valid_train,
    valid_dev,
    valid_test,
    other_train,
    other_dev,
    other_test,
) = map(
    pd.read_csv,
    (
        'cv-invalid.csv',
        'cv-valid-train.csv',
        'cv-valid-dev.csv',
        'cv-valid-test.csv',
        'cv-other-train.csv',
        'cv-other-dev.csv',
        'cv-other-test.csv',
    ),
)

In [None]:
# Stack the seven splits row-wise into a single frame. concat is left at its
# default of keeping each split's own row labels, so index values repeat
# across splits.
voices_df = pd.concat(
    [
        invalid,
        valid_train, valid_dev, valid_test,
        other_train, other_dev, other_test,
    ]
)
# Column dtypes and non-null counts for the combined frame.
voices_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380368 entries, 0 to 2960
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   filename    380368 non-null  object 
 1   text        380366 non-null  object 
 2   up_votes    380368 non-null  int64  
 3   down_votes  380368 non-null  int64  
 4   age         149021 non-null  object 
 5   gender      149281 non-null  object 
 6   accent      129234 non-null  object 
 7   duration    0 non-null       float64
dtypes: float64(1), int64(2), object(5)
memory usage: 26.1+ MB


In [None]:
# Discard the filename and duration columns in a single call
# (duration was entirely null in the recorded info() output).
voices_df = voices_df.drop(columns=['filename', 'duration'])
voices_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380368 entries, 0 to 2960
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   text        380366 non-null  object
 1   up_votes    380368 non-null  int64 
 2   down_votes  380368 non-null  int64 
 3   age         149021 non-null  object
 4   gender      149281 non-null  object
 5   accent      129234 non-null  object
dtypes: int64(2), object(4)
memory usage: 20.3+ MB


In [None]:
# Samples with no votes at all aren't helpful, so keep only the rows whose
# combined up/down vote count is positive.
voices_df = voices_df[voices_df['up_votes'].add(voices_df['down_votes']).gt(0)]
voices_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238218 entries, 0 to 2951
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   text        238217 non-null  object
 1   up_votes    238218 non-null  int64 
 2   down_votes  238218 non-null  int64 
 3   age         85410 non-null   object
 4   gender      85735 non-null   object
 5   accent      74696 non-null   object
dtypes: int64(2), object(4)
memory usage: 12.7+ MB


In [None]:
# Since the number of votes on each sample varies so much, the share of
# upvotes is a more useful metric than the raw counts.
# percent_correct is stored as a fraction (up_votes / total_votes), not
# multiplied by 100.
voices_df = voices_df.assign(
    total_votes=lambda frame: frame['up_votes'] + frame['down_votes'],
    # Evaluated after total_votes, so the new column is already available.
    percent_correct=lambda frame: frame['up_votes'] / frame['total_votes'],
)
voices_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238218 entries, 0 to 2951
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   text             238217 non-null  object 
 1   up_votes         238218 non-null  int64  
 2   down_votes       238218 non-null  int64  
 3   age              85410 non-null   object 
 4   gender           85735 non-null   object 
 5   accent           74696 non-null   object 
 6   total_votes      238218 non-null  int64  
 7   percent_correct  238218 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 16.4+ MB


In [None]:
# Probably better to focus on samples reviewed by more than one person:
# keep rows whose combined up/down vote count exceeds one.
voices_multi_valid = voices_df.loc[
    lambda frame: frame['up_votes'] + frame['down_votes'] > 1
]
voices_multi_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145041 entries, 0 to 2951
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   text             145040 non-null  object 
 1   up_votes         145041 non-null  int64  
 2   down_votes       145041 non-null  int64  
 3   age              48283 non-null   object 
 4   gender           48625 non-null   object 
 5   accent           43092 non-null   object 
 6   total_votes      145041 non-null  int64  
 7   percent_correct  145041 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 10.0+ MB


In [None]:
# Number of multi-vote samples per accent label, most frequent first.
# (Call the same method on voices_df instead to count every voted sample.)
voices_multi_valid.value_counts(subset='accent')

accent
us                21193
england            9036
australia          2918
indian             2890
canada             2785
african            1017
scotland            927
ireland             658
newzealand          644
philippines         269
malaysia            189
bermuda             170
wales               163
singapore           112
southatlandtic       68
hongkong             53
dtype: int64

In [None]:
# Mean vote statistics per accent, computed over samples with more than one
# vote. (Group voices_df instead to include single-vote samples.)
#
# numeric_only=True restricts the aggregation to the numeric vote columns.
# The frame also holds string columns (text, age, gender) that cannot be
# averaged: older pandas dropped them with a FutureWarning, and pandas >= 2.0
# raises a TypeError when the argument is omitted.
# groupby does not modify its input, so no defensive copy is needed.
accent_class = voices_multi_valid.groupby('accent').mean(numeric_only=True)
accent_class

  accent_class = accent_class.groupby('accent').mean()


Unnamed: 0_level_0,up_votes,down_votes,total_votes,percent_correct
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
african,2.949853,0.473943,3.423795,0.854352
australia,3.115147,0.366004,3.481151,0.894433
bermuda,3.141176,0.541176,3.682353,0.863581
canada,3.208259,0.331059,3.539318,0.916431
england,3.27977,0.302125,3.581895,0.916308
hongkong,1.867925,0.490566,2.358491,0.791824
indian,2.709343,0.62872,3.338062,0.806547
ireland,3.12462,0.352584,3.477204,0.902907
malaysia,6.433862,5.439153,11.873016,0.748623
newzealand,3.625776,0.236025,3.861801,0.943558


In [None]:
# Mean vote statistics per age bracket, computed over samples with more than
# one vote. (Group voices_df instead to include single-vote samples.)
#
# numeric_only=True restricts the aggregation to the numeric vote columns.
# The frame also holds string columns (text, gender, accent) that cannot be
# averaged: older pandas dropped them with a FutureWarning, and pandas >= 2.0
# raises a TypeError when the argument is omitted.
# groupby does not modify its input, so no defensive copy is needed.
age_group = voices_multi_valid.groupby('age').mean(numeric_only=True)
age_group

  age_group = age_group.groupby('age').mean()


Unnamed: 0_level_0,up_votes,down_votes,total_votes,percent_correct
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eighties,2.581081,0.27027,2.851351,0.909122
fifties,3.041836,0.2771,3.318936,0.909309
fourties,2.911005,0.28011,3.191115,0.915048
seventies,3.026188,0.303589,3.329777,0.904442
sixties,2.819149,0.287234,3.106383,0.909162
teens,3.068852,0.593208,3.662061,0.84221
thirties,3.270972,0.328813,3.599784,0.911776
twenties,3.34948,0.525134,3.874614,0.87921


In [None]:
# Mean vote statistics per gender label, computed over samples with more than
# one vote. (Group voices_df instead to include single-vote samples.)
#
# numeric_only=True restricts the aggregation to the numeric vote columns.
# The frame also holds string columns (text, age, accent) that cannot be
# averaged: older pandas dropped them with a FutureWarning, and pandas >= 2.0
# raises a TypeError when the argument is omitted.
# groupby does not modify its input, so no defensive copy is needed.
gender = voices_multi_valid.groupby('gender').mean(numeric_only=True)
gender

  gender = gender.groupby('gender').mean()


Unnamed: 0_level_0,up_votes,down_votes,total_votes,percent_correct
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,3.039125,0.368032,3.407157,0.895187
male,3.227591,0.412454,3.640045,0.895194
other,3.5125,0.49375,4.00625,0.876319
