Data Visualisation

Data Visualisation

In [None]:
# imports: standard library first, then third-party
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# notebook-wide setup: silence every warning and use seaborn's "white" style
warnings.filterwarnings(action="ignore")
sns.set(color_codes=True, style="white")

# load dataset
voice = pd.read_csv(filepath_or_buffer="../input/voice.csv")

In [None]:
# First sanity check: summary statistics of the loaded voice table
voice.describe()

In [None]:
# meanfreq: mean frequency (in kHz)
# sd: standard deviation of frequency
# median: median frequency (in kHz)
# Q25: first quartile (in kHz)
# Q75: third quartile (in kHz)
# IQR: interquartile range (in kHz)
# skew: skewness (see note in specprop description)
# kurt: kurtosis (see note in specprop description)
# sp.ent: spectral entropy
# sfm: spectral flatness
# mode: mode frequency
# centroid: frequency centroid (see specprop)
# meanfun: average of fundamental frequency measured across acoustic signal
# minfun: minimum fundamental frequency measured across acoustic signal
# maxfun: maximum fundamental frequency measured across acoustic signal
# meandom: average of dominant frequency measured across acoustic signal
# mindom: minimum of dominant frequency measured across acoustic signal
# maxdom: maximum of dominant frequency measured across acoustic signal
# dfrange: range of dominant frequency measured across acoustic signal
# modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range

In [None]:
# Peek at the first rows of the raw table
voice.head()

In [None]:
# Show the meanfun column in ascending order.
# Series.order() was removed from pandas; sort_values() is its replacement.
voice["meanfun"].sort_values()

In [None]:
# Count how many samples carry each value of the label column
voice["label"].value_counts()

In [None]:
# Scatter of meanfun against meanfreq, one colour per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=10)
grid.map(plt.scatter, "meanfun", "meanfreq")
grid.add_legend()
plt.show()

In [None]:
# Scatter of meanfun against meandom, one colour per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=10)
grid.map(plt.scatter, "meanfun", "meandom")
grid.add_legend()
plt.show()

In [None]:
# Kernel density estimate of meanfun, one curve per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=6)
grid.map(sns.kdeplot, "meanfun")
grid.add_legend()
plt.show()

In [None]:
# to see the effect of every feature ==> radviz circle
# radviz is imported from pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
from pandas.plotting import radviz

radviz(voice, "label")
plt.show()

In [None]:
# First sanity check: summary statistics of the loaded voice table
voice.describe()

In [None]:
# meanfreq: mean frequency (in kHz)
# sd: standard deviation of frequency
# median: median frequency (in kHz)
# Q25: first quartile (in kHz)
# Q75: third quartile (in kHz)
# IQR: interquartile range (in kHz)
# skew: skewness (see note in specprop description)
# kurt: kurtosis (see note in specprop description)
# sp.ent: spectral entropy
# sfm: spectral flatness
# mode: mode frequency
# centroid: frequency centroid (see specprop)
# meanfun: average of fundamental frequency measured across acoustic signal
# minfun: minimum fundamental frequency measured across acoustic signal
# maxfun: maximum fundamental frequency measured across acoustic signal
# meandom: average of dominant frequency measured across acoustic signal
# mindom: minimum of dominant frequency measured across acoustic signal
# maxdom: maximum of dominant frequency measured across acoustic signal
# dfrange: range of dominant frequency measured across acoustic signal
# modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range

In [None]:
# Peek at the first rows of the raw table
voice.head()

In [None]:
# Show the meanfun column in ascending order.
# Series.order() was removed from pandas; sort_values() is its replacement.
voice["meanfun"].sort_values()

In [None]:
# Count how many samples carry each value of the label column
voice["label"].value_counts()

In [None]:
# Scatter of meanfun against meanfreq, one colour per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=10)
grid.map(plt.scatter, "meanfun", "meanfreq")
grid.add_legend()
plt.show()

In [None]:
# Scatter of meanfun against meandom, one colour per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=10)
grid.map(plt.scatter, "meanfun", "meandom")
grid.add_legend()
plt.show()

In [None]:
# Kernel density estimate of meanfun, one curve per label.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
grid = sns.FacetGrid(voice, hue="label", height=6)
grid.map(sns.kdeplot, "meanfun")
grid.add_legend()
plt.show()

In [None]:
# to see the effect of every feature ==> radviz circle
# radviz is imported from pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
from pandas.plotting import radviz

radviz(voice, "label")
plt.show()

Logistic Regression

In [None]:
# scikit-learn building blocks for the logistic-regression section.
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# and cross_val_score are imported from sklearn.model_selection instead.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# preprocessing: split features and outcome
X = voice.iloc[:, :-1]  # every column except the last one -> features
Y = voice.iloc[:, -1]   # last column -> label (same column the X slice leaves out)

# convert the label values to integer class codes
# (the class is LabelEncoder; the lowercase `labelEncoder` raised a NameError)
label = LabelEncoder()
y = label.fit_transform(Y)

In [None]:
# hold out 30% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# pipeline steps, in order: standardise -> project onto 2 principal components -> classify
estimators = [
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
]

In [None]:
# chain the steps into one estimator and train it on the training split
pipe_lr = Pipeline(steps=estimators)
pipe_lr.fit(X=X_train, y=y_train)

In [None]:
# 10-fold cross-validation of the pipeline on the training split
# cv: number of folds the training data is split into
# n_jobs: number of CPU cores used to evaluate the folds
scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10, n_jobs=1)

# report the mean accuracy and its spread across the folds
# (previously the scores were computed but never displayed)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))

Learning Curves 

In [None]:
# Learning curve: training vs. validation accuracy as the training set grows.
import matplotlib.pyplot as plt
import numpy as np
# learning_curve is imported from sklearn.model_selection; the old
# sklearn.learning_curve module was removed from scikit-learn.
from sklearn.model_selection import learning_curve

# scaler + L2-regularised logistic regression (no PCA step in this pipeline)
pipe_lr = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression(penalty='l2', random_state=0))])

# evaluate at 10 training-set sizes (10% .. 100% of X_train), 10-fold CV at each size
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)

# aggregate across the folds (axis=1): one mean / std per training-set size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# mean curves with a +/- one standard deviation band
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
plt.show()

In [None]:
# Validation curve: training vs. validation accuracy as the regularisation
# parameter C of the logistic regression varies.
# validation_curve is imported from sklearn.model_selection; the old
# sklearn.learning_curve module was removed from scikit-learn.
from sklearn.model_selection import validation_curve

# candidate values for C, spaced by factors of ten (hence the log x-axis below)
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# 'clf__C' addresses parameter C of the pipeline step named 'clf'
train_scores, test_scores = validation_curve(estimator=pipe_lr, X=X_train, y=y_train, param_name='clf__C', param_range=param_range, cv=10)

# aggregate across the folds (axis=1): one mean / std per value of C
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# mean curves with a +/- one standard deviation band
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.title('Validation Curve')
plt.ylim([0.8, 1.0])
plt.show()
