# Exploratory Data Analysis

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
speech = pd.read_csv('speech.tsv', sep='\t')
sentences = pd.read_csv('sentences.tsv', sep='\t')
users = pd.read_csv('users.tsv', sep='\t')

## Peeking at the data

In [4]:
speech.head()

Unnamed: 0,id,sentence,user,fileName,sample,time,vote
0,007b0xQfxvsmaYVRN7wS,mmujzi8AdOyGyLt95lNH,sutUe8ScwBdhopQvzr2VzUO28q03,audio/01f276e0-b451-439f-b686-6856fc11e2b4.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1584562139945,default
1,022Bgg06giNtYdeKfvOs,eRBPHg3VacjjQeLRN1vB,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/cd7ec9d9-adb4-45d8-a299-a631e3252991.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573228579118,11
2,048AWoisCWs2UWf0eEtA,rRKWjXZu1vlzFi0uqngC,mYTyO3jrvgZv1XU0pZwRbSwwuhC3,audio/8264035d-f9e6-4933-9f9e-97eb1ec83234.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1580404212435,default
3,06o5F5pl9YRbHpSGQTKT,tQoJUX0JnMgsAnVOKfZw,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/a21ab006-57bd-485e-bc3c-e5aa70e02575.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573229287922,11
4,07LFdQOyNS7TFj26pJUC,KZzOTLufc3E5QUuiGeJs,sutUe8ScwBdhopQvzr2VzUO28q03,audio/f42dc879-fae8-4386-b6f4-187d5f29e828.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1584562993254,default


In [5]:
sentences.head()

Unnamed: 0,id,sentence,category
0,08vCWSixKguCxbN8Noel,സൂചികൊണ്ടെടുക്കേണ്ടത് തൂമ്പ കൊണ്ടെടുക്കരുത്.,proverb
1,09qAOEXP8rX17zgrzNb0,കയ്യിൽ കാശുണ്ടോ?,conversation
2,0AVDnHgMnGBJ9aexjJAk,ഞാൻ ഒരു പാട്ട് പാടട്ടെ.,conversation
3,0LztDTFtWsAw6JCpvx1S,സമയമെത്രയായി?,conversation
4,0RUQfytlKJx3E0rM2hwM,ഓണം വരാനൊരു മൂലം വേണം.,proverb


In [6]:
users.head()

Unnamed: 0,id,name
0,09eJLYsUg8XByGwaPX3P6oykfui2,Manu Kurakar
1,0aokYrDll4TMRR3WvXVvSMtbQBA3,VIDENSHUS VIMALA
2,10gWfkC1DFYukBgs1OI0EJEz41m2,Raju Hariharan
3,11nKjHmpoZPhPDxX60Dt2Gxb0DD2,NuraaruN88
4,19n9jw6zdmVj7kfN6FnU9PZTxk43,Anirudh Sankr


In [46]:
print(f"{speech.shape[0]} sounds has been recorded by {users.shape[0]} users")

2074 sounds has been recorded by 195 users


## Checking for null values

In [8]:
speech.isnull().sum()

id          0
sentence    0
user        0
fileName    0
sample      0
time        0
vote        0
dtype: int64

In [9]:
sentences.isnull().sum()

id          0
sentence    0
category    0
dtype: int64

- Fortunately there is no null values in our dataset

## Sentences based on various categories

In [68]:
df = sentences.category.value_counts()
colors = ["gold", "mediumturquoise", "darkorange", "lightgreen"]
fig = px.pie(df, df.index, df.values, labels={"index": "Categories"})
fig.update_traces(
    hoverinfo="label+percent",
    textinfo="value",
    textfont_size=20,
    marker=dict(colors=colors, line=dict(color="#000000", width=2)),
)
fig.update_layout(title="No of sentences belonging to each category in MSC dataset")

fig.show()

## Top 15 most contributed users to MSC

In [45]:
users_speech = pd.merge(speech, users, left_on='user', right_on='id')
users_speech.name.value_counts()[:15]

Muneer Jinnen          532
Kavya Manohar          253
Santhosh Thottingal    158
kavitha manohar        126
abdul azeez vengara    120
notsoperfectkarthi     118
default                 70
Jaya K                  56
Manohar MUNNUMAKAL      42
Ashish Thampi           42
Majo Davis              35
Hrishikesh K.B          34
vipin vijayan           31
Jyothis Jagan           25
Dr Jada                 25
Name: name, dtype: int64

## Total time duration

In [12]:
sum = 0
filenames = os.listdir('audio/')
for f in filenames:
    y, sr = librosa.load(os.path.join('audio/', f))
    sum = sum + librosa.get_duration(y=y, sr=sr)

NameError: name 'librosa' is not defined

## Categories of Speech sentences

In [69]:
speech_sentences = pd.merge(speech, sentences, left_on='sentence', right_on='id')
df = speech_sentences.category.value_counts()
colors = ["gold", "mediumturquoise", "darkorange", "lightgreen"]
fig = px.pie(df, df.index, df.values, labels={"index": "Categories"})
fig.update_traces(
    hoverinfo="label+percent",
    textinfo="value",
    textfont_size=20,
    marker=dict(colors=colors, line=dict(color="#000000", width=2)),
)
fig.update_layout(title="Categories of user utterances in MSC dataset")

fig.show()

## Good quality speech 

- criteria: min 3 upvotes

In [64]:
sound = speech[speech.vote != 'default']
sound['vote'] = sound['vote'].astype(str).astype(float)
good_sound = sound[sound['vote']>=3]

In [67]:
good_sound = good_sound.reset_index()
print("No of good sound sampeles is", good_sound.shape[0])
good_sound.head()

No of good sound sampeles is 283


Unnamed: 0,index,id,sentence,user,fileName,sample,time,vote
0,1,022Bgg06giNtYdeKfvOs,eRBPHg3VacjjQeLRN1vB,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/cd7ec9d9-adb4-45d8-a299-a631e3252991.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573228579118,11.0
1,3,06o5F5pl9YRbHpSGQTKT,tQoJUX0JnMgsAnVOKfZw,2dEDqqeI3PZx0m2D9EBHUn7noTM2,audio/a21ab006-57bd-485e-bc3c-e5aa70e02575.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1573229287922,11.0
2,7,0ErcCyR2OdDcFi31xmSz,0AVDnHgMnGBJ9aexjJAk,59Tt9EuW2DOtyckjvkMVmuPqRW13,audio/eb8b87cf-82a3-4a8e-91b8-775ae8250e7a.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1586442218313,4.0
3,8,0Gbc6JQLVd6ah2P5Nz0D,2hkQEfpIiHauS7txxAH7,6FzGR8AJFJSEml6AIs55fEgJmqX2,audio/3f6ec7ce-e480-49ba-b53f-29990a3918e7.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1572600034618,30.0
4,9,0I2iIh7OSMdWhvb85tMp,5FiRVyH58OhlpIak6NCX,oeaNxrE0uxNSfoDpdrCYGBGC7uC3,audio/63a28877-988f-4f87-a6dd-f0d2b5c24944.webm,https://firebasestorage.googleapis.com/v0/b/ma...,1572577161778,42.0
