# Exploratory Data Analysis

In [10]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Load the three tab-separated tables that make up the dataset:
# recordings (speech), prompt sentences, and contributing users.
speech, sentences, users = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('speech.tsv', 'sentences.tsv', 'users.tsv')
)

In [3]:
# Preview the first five rows of the recordings table.
speech.head(n=5)

Unnamed: 0,id,sentence,user,fileName,sample,time,vote
0,007b0xQfxvsmaYVRN7wS,mmujzi8AdOyGyLt95lNH,sutUe8ScwBdhopQvzr2VzUO28q03,audio/01f276e0-b451-439f-b686-6856fc11e2b4.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1584562139945,default
1,022Bgg06giNtYdeKfvOs,eRBPHg3VacjjQeLRN1vB,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/cd7ec9d9-adb4-45d8-a299-a631e3252991.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573228579118,11
2,048AWoisCWs2UWf0eEtA,rRKWjXZu1vlzFi0uqngC,mYTyO3jrvgZv1XU0pZwRbSwwuhC3,audio/8264035d-f9e6-4933-9f9e-97eb1ec83234.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1580404212435,default
3,06o5F5pl9YRbHpSGQTKT,tQoJUX0JnMgsAnVOKfZw,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/a21ab006-57bd-485e-bc3c-e5aa70e02575.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573229287922,11
4,07LFdQOyNS7TFj26pJUC,KZzOTLufc3E5QUuiGeJs,sutUe8ScwBdhopQvzr2VzUO28q03,audio/f42dc879-fae8-4386-b6f4-187d5f29e828.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1584562993254,default


In [4]:
# Preview the first five rows of the sentences table.
sentences.head(n=5)

Unnamed: 0,id,sentence,category
0,08vCWSixKguCxbN8Noel,സൂചികൊണ്ടെടുക്കേണ്ടത് തൂമ്പ കൊണ്ടെടുക്കരുത്.,proverb
1,09qAOEXP8rX17zgrzNb0,കയ്യിൽ കാശുണ്ടോ?,conversation
2,0AVDnHgMnGBJ9aexjJAk,ഞാൻ ഒരു പാട്ട് പാടട്ടെ.,conversation
3,0LztDTFtWsAw6JCpvx1S,സമയമെത്രയായി?,conversation
4,0RUQfytlKJx3E0rM2hwM,ഓണം വരാനൊരു മൂലം വേണം.,proverb


In [5]:
# Preview the first five rows of the users table.
# NOTE(review): the rendered output includes the `name` column; confirm these
# names are safe to publish before sharing the notebook with outputs.
users.head(n=5)

Unnamed: 0,id,name
0,09eJLYsUg8XByGwaPX3P6oykfui2,Manu Kurakar
1,0aokYrDll4TMRR3WvXVvSMtbQBA3,VIDENSHUS VIMALA
2,10gWfkC1DFYukBgs1OI0EJEz41m2,Raju Hariharan
3,11nKjHmpoZPhPDxX60Dt2Gxb0DD2,NuraaruN88
4,19n9jw6zdmVj7kfN6FnU9PZTxk43,Anirudh Sankr


In [6]:
# One-line summary: row count of the recordings table vs. the users table.
summary_line = f"{speech.shape[0]} sounds has been recorded by {users.shape[0]} users"
print(summary_line)

2074 sounds has been recorded by 195 users


## Checking for null values

In [7]:
# Count missing values per column of the recordings table.
speech.isna().sum()

id          0
sentence    0
user        0
fileName    0
sample      0
time        0
vote        0
dtype: int64

In [8]:
# Count missing values per column of the sentences table.
sentences.isna().sum()

id          0
sentence    0
category    0
dtype: int64

- Fortunately, there are no null values in the `speech` or `sentences` tables.

## Sentences based on various categories

In [9]:
# Number of sentences in each category.
df = sentences['category'].value_counts()
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

# One slice per category, sized by how many sentences it contains.
fig = px.pie(
    df,
    names=df.index,
    values=df.values,
    labels={'index': 'Categories'},
)

# Show the raw count on each slice, label + percentage on hover,
# and outline the slices in black.
marker_style = dict(colors=colors, line=dict(color='#000000', width=2))
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=20,
    marker=marker_style,
)
fig.update_layout(title='Categories of sentences in Malayalam Speech corpus dataset')

fig.show()


## Finding users who contributed most to MSC

In [11]:
# Recordings per user id, keeping the 15 most prolific contributors.
speech['user'].value_counts().head(15)

2dEDqqeI3PZx0m2D9EBHUn7noTM2    532
rjh2u8PthLd9AuPARWuuiRcZKfp1    253
59Tt9EuW2DOtyckjvkMVmuPqRW13    158
MYf90clR8ehYYBOp1NAncn85tuH3    126
sutUe8ScwBdhopQvzr2VzUO28q03    120
oeaNxrE0uxNSfoDpdrCYGBGC7uC3    118
Li6cyWawv0TavTANle4yTmzCDty2     56
vUdcLk4ROIR8CsaYAG6YsAmj15J3     43
4g8oHH1MgvOvECpoSKbWWrRpWT82     42
WnrwjCDN1PgNuCK4IrstUpDLJB93     42
LTokmTKpwaO6Bdb5fTtoSLYMc8e2     35
KPppnsxEYoYUpLTsjFfNl8exkYv1     34
L2VjTEBUwJVobWfHy9wPZo0C7EQ2     31
c2UZvQV9xbYfBUw847Pypqb5PsS2     25
AzQ1CqU4MZXCAVzqDnLE50H72Jm1     25
Name: user, dtype: int64

## Total time duration

In [None]:
# Total duration of every recording under AUDIO_DIR.
# Each file is fully decoded with librosa.load and its duration is computed
# from the decoded samples, so the cell performs one decode per file.
# Requires `os` and `librosa` to be imported (see the import cell).
AUDIO_DIR = 'audio/'

filenames = os.listdir(AUDIO_DIR)
total_duration = 0
for f in filenames:
    audio_path = os.path.join(AUDIO_DIR, f)
    if not os.path.isfile(audio_path):
        # os.listdir also returns sub-directories, which cannot be decoded.
        continue
    y, sr = librosa.load(audio_path)
    total_duration += librosa.get_duration(y=y, sr=sr)

# The original cell accumulated into `sum`; the binding is kept so any later
# cell that reads it still works. Note that it shadows the builtin sum() for
# the rest of the kernel session, so prefer `total_duration` in new code.
sum = total_duration

# Display the result as the cell output.
total_duration