In [1]:
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
from sklearn import preprocessing

from sklearn.model_selection import StratifiedKFold

from statistics import mean, stdev

from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import RandomizedSearchCV

# Classifiers
from sklearn.ensemble import RandomForestClassifier


# metrics are used to find accuracy or error
from sklearn import metrics

In [2]:
class training():
    
    df = pd.DataFrame()
    
    X = pd.DataFrame()
    scaled_X = pd.DataFrame()
    y = []
    
    X_train = pd.DataFrame()
    y_train = []
    X_val = pd.DataFrame()
    y_val = []
    y_pred = []
    lst_accu_stratified = []
    
    model = RandomForestClassifier()
    cross_val = True
    grid_search = False
    rf_random = RandomizedSearchCV(estimator = model, param_distributions = {'max_features': ['auto','sqrt']})
    

    def __init__(self, filename):
        self.df = pd.read_csv(filename)
        print(f"Read the python {filename} file ")
        
    def display(self):
        display(self.df.head())
        
    def list_columns(self):
        display(self.df.columns)
        
    def renaming_columns(self):
        self.df.columns = self.df.columns.str.strip()
        print("Stripped leading whitespaces in column names")
        self.df = self.df.rename(columns = {'dx':'target'})
        print("Renamed the column name from 'dx' to 'target'")
        display(self.df.columns)
        
    def checking_imbalance(self):
        display(self.df['target'].value_counts())
        
    def basic_analysis(self):
        display(self.df.describe())
        
    #Subtracting overall utterance mean foramnt of f2 and f1 and creating a new feature
    def feature_engineering(self):
        self.df = self.df.assign(diff_f2_f1=self.df['meanf2'] - self.df['meanf1'])
        print("Subtracting formant f2 and f1 for the whole utterance ")
        self.df.insert(6, 'diff_f2_f1', self.df.pop('diff_f2_f1'))
        print("Inserting the column in different position")
        display(self.df.head())
        
    #Normalizing the data due to different scales in all the columns 
    def scale_data(self):
        self.X = self.df.drop(columns=['target']).copy()
        self.y = self.df['target']
        scaler = preprocessing.MinMaxScaler()
        self.scaled_X = df = pd.DataFrame(scaler.fit_transform(self.X))
        display(self.scaled_X.head())
    
    #Splitting the data into train and val in 80:20 ratio
    def split_train_val(self):
        
        if self.cross_val == True:
            pass
        else:
            print("NOT USING STRATIFIED K FOLD")
            self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.scaled_X,self.y, train_size=0.8, stratify=self.y, random_state = 1)
            print("Splitted the 80% of the data for training and 20% of the data for validation")
            print("Shape of Training Data:", self.X_train.shape,"\n Shape of Training prediction column: ",len(self.y_train))
            print("Shape of Validation Data:", self.X_val.shape,"\n Shape of Validation prediction column: ",len(self.y_val))
     
    #Printing the paramsof Random Forest model
    def get_params(self):
        pprint(self.model.get_params())
        
    #Uing Grid Search CV to choose best perameters for the model training
    def grid_search(self):
        self.grid_search = True
        # Number of trees in random forest
        n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap}
        pprint(random_grid)
        
        # Random search of parameters, using 5 fold cross validation, 
        # search across 100 different combinations, and use all available cores
        self.rf_random = RandomizedSearchCV(estimator = self.model, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=1, n_jobs = -1)
        # Fit the random search model
        self.rf_random.fit(self.scaled_X, self.y)
        print("Best Hyperparameter setting would be")
        print(self.rf_random.best_params_)
    
    #Training the data
    def train_data(self):
        #If grid search CV is performed then choose the best hyperparameters
        if self.grid_search == True:
            self.model = self.rf_random.best_estimator_
        #If cross validation is chosen then perform stratified cross validation
        if self.cross_val == True:
            print("USING STRATIFIED K FOLD")
            skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=1)
            self.lst_accu_stratified = []
  
            for i, (train_index, test_index) in enumerate(skf.split(self.X, self.y)):
                print(f"Fold {i}:")
                print(f"  Train: index={train_index}")
                print(f"  Test:  index={test_index}")
                x_train_fold, x_test_fold = self.scaled_X.iloc[train_index], self.scaled_X.iloc[test_index]
                y_train_fold, y_test_fold = self.y.iloc[train_index], self.y.iloc[test_index]
                self.model.fit(x_train_fold, y_train_fold)
                self.lst_accu_stratified.append(self.model.score(x_test_fold, y_test_fold))
        # If Grid search and cross val are false then train normally with the default parameters
        else:
            
            # Training the model on the training dataset
            # fit function is used to train the model using the training sets as parameters
            print("Fitting the data on the Random Forest Classifier with default parameters")
            self.model.fit(self.X_train, self.y_train)

            # performing predictions on the test dataset
            print("Predicting data for the validation data")
            self.y_pred = self.model.predict(self.X_val)
                    
    def print_metrics(self):
        if self.cross_val == True:
            # Print the output.
            print('List of possible accuracy:', self.lst_accu_stratified)
            print('\nMaximum Accuracy That can be obtained from this model is:',
                  max(self.lst_accu_stratified)*100, '%')
            print('\nMinimum Accuracy:',
                  min(self.lst_accu_stratified)*100, '%')
            print('\nOverall Accuracy:',
                  mean(self.lst_accu_stratified)*100, '%')
            print('\nStandard Deviation is:', stdev(self.lst_accu_stratified))
        else:
            # using metrics module for accuracy calculation
            print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(self.y_val, self.y_pred))
            print(metrics.classification_report(self.y_val, self.y_pred))
            print(metrics.confusion_matrix(self.y_val, self.y_pred))
            
    def save_model(self):
        pickle.dump(self.model, open('training_model3.pkl', 'wb'))
        

In [3]:
obj = training("trainingdialectsdataoutput2.csv")

Read the python trainingdialectsdataoutput2.csv file 


In [4]:
obj.display()

Unnamed: 0,maxint,meanint,maxf1,meanf1,maxf2,meanf2,maxpitch,meanpitch,maxharmonicity,meanharmonicity,numberofpulses,numberofperiods,jitter,shimmer,unvoicedframes,voicebreaks,meanautocorrelation,meannoisetoharmonicsratio,meanharmonicstonoiseratio,dx
0,83.923979,71.581765,2226.93239,773.866883,3967.172332,1801.651925,514.737501,172.447258,44.809105,10.440155,1950,1882,0.01912,0.09421,0.48859,65,0.898809,0.138006,12.589,0
1,86.129707,72.526193,2930.52973,1246.513162,4107.236735,2422.437451,604.702641,259.249734,46.237202,5.201708,2428,1792,0.02496,0.17138,0.58812,94,0.71812,0.441715,4.519,0
2,78.960632,66.720782,2622.938312,822.587877,4120.452016,1957.931644,593.54125,120.090515,41.615041,7.845683,1261,1193,0.02358,0.1151,0.47402,55,0.858418,0.200226,9.754,0
3,83.501359,74.094033,2850.87548,696.628495,4132.153167,1916.040737,588.665705,197.442493,45.275869,11.337814,2544,2482,0.0199,0.07547,0.38127,55,0.919261,0.11166,14.233,0
4,83.844765,74.403868,2978.975257,759.61804,3880.324436,2116.483297,592.219033,180.912475,48.600159,3.869466,1667,1518,0.01601,0.11529,0.54707,88,0.742763,0.408569,5.355,0


In [5]:
obj.list_columns()

Index(['maxint', 'meanint', 'maxf1', 'meanf1', 'maxf2', 'meanf2', ' maxpitch',
       ' meanpitch', ' maxharmonicity', ' meanharmonicity', ' numberofpulses',
       ' numberofperiods', ' jitter', ' shimmer', ' unvoicedframes',
       ' voicebreaks', ' meanautocorrelation', ' meannoisetoharmonicsratio',
       ' meanharmonicstonoiseratio', ' dx'],
      dtype='object')

In [6]:
obj.renaming_columns()

Stripped leading whitespaces in column names
Renamed the column name from 'dx' to 'target'


Index(['maxint', 'meanint', 'maxf1', 'meanf1', 'maxf2', 'meanf2', 'maxpitch',
       'meanpitch', 'maxharmonicity', 'meanharmonicity', 'numberofpulses',
       'numberofperiods', 'jitter', 'shimmer', 'unvoicedframes', 'voicebreaks',
       'meanautocorrelation', 'meannoisetoharmonicsratio',
       'meanharmonicstonoiseratio', 'target'],
      dtype='object')

In [7]:
obj.checking_imbalance()

0    20
1    20
Name: target, dtype: int64

In [8]:
obj.feature_engineering()

Subtracting formant f2 and f1 for the whole utterance 
Inserting the column in different position


Unnamed: 0,maxint,meanint,maxf1,meanf1,maxf2,meanf2,diff_f2_f1,maxpitch,meanpitch,maxharmonicity,...,numberofpulses,numberofperiods,jitter,shimmer,unvoicedframes,voicebreaks,meanautocorrelation,meannoisetoharmonicsratio,meanharmonicstonoiseratio,target
0,83.923979,71.581765,2226.93239,773.866883,3967.172332,1801.651925,1027.785042,514.737501,172.447258,44.809105,...,1950,1882,0.01912,0.09421,0.48859,65,0.898809,0.138006,12.589,0
1,86.129707,72.526193,2930.52973,1246.513162,4107.236735,2422.437451,1175.924289,604.702641,259.249734,46.237202,...,2428,1792,0.02496,0.17138,0.58812,94,0.71812,0.441715,4.519,0
2,78.960632,66.720782,2622.938312,822.587877,4120.452016,1957.931644,1135.343767,593.54125,120.090515,41.615041,...,1261,1193,0.02358,0.1151,0.47402,55,0.858418,0.200226,9.754,0
3,83.501359,74.094033,2850.87548,696.628495,4132.153167,1916.040737,1219.412242,588.665705,197.442493,45.275869,...,2544,2482,0.0199,0.07547,0.38127,55,0.919261,0.11166,14.233,0
4,83.844765,74.403868,2978.975257,759.61804,3880.324436,2116.483297,1356.865256,592.219033,180.912475,48.600159,...,1667,1518,0.01601,0.11529,0.54707,88,0.742763,0.408569,5.355,0


In [9]:
obj.scale_data()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.679272,0.5211,0.023043,0.269264,0.72286,0.018918,0.188659,0.787652,0.482242,0.418591,0.835939,0.462784,0.508929,0.317138,0.201909,0.576363,0.45283,0.89832,0.079823,0.83076
1,0.981143,0.597489,0.660792,0.997587,0.882566,1.0,0.553893,0.98965,1.0,0.490796,0.245471,0.647127,0.470663,0.628938,0.853848,0.986644,1.0,0.0,1.0,0.0
2,0.0,0.127928,0.381988,0.34434,0.897635,0.2659,0.453843,0.964589,0.169946,0.2571,0.543495,0.197069,0.215986,0.555259,0.37839,0.516303,0.264151,0.697511,0.268337,0.538913
3,0.621433,0.724301,0.588592,0.150244,0.910977,0.199696,0.661112,0.953642,0.631333,0.442191,0.937121,0.691863,0.764031,0.358783,0.043592,0.133971,0.264151,1.0,0.0,1.0
4,0.668431,0.749361,0.704704,0.247307,0.623832,0.516474,1.0,0.961621,0.532736,0.610267,0.095304,0.353644,0.354167,0.151095,0.379995,0.817429,0.886792,0.122516,0.899574,0.086061


In [10]:
obj.split_train_val()

In [11]:
obj.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [12]:
obj.grid_search()

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameter setting would be
{'n_estimators': 1200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}


In [13]:
obj.train_data()

USING STRATIFIED K FOLD
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 29 30 31 32 33 34 35 36 37 38 39]
  Test:  index=[11 28]
Fold 1:
  Train: index=[ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 31 32 33 34 35 36 37 38 39]
  Test:  index=[ 9 30]
Fold 2:
  Train: index=[ 0  1  2  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 36 37 38 39]
  Test:  index=[ 4 35]
Fold 3:
  Train: index=[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 24 25
 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
  Test:  index=[ 0 23]
Fold 4:
  Train: index=[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 35 36 37 38 39]
  Test:  index=[ 6 34]
Fold 5:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 20 21 22 23 24
 25 27 28 29 30 31 32 33 34 35 36 37 38 39]
  Test:  index=[19 26

In [14]:
obj.print_metrics()

List of possible accuracy: [1.0, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5, 0.5, 0.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, 1.0, 0.5, 0.5]

Maximum Accuracy That can be obtained from this model is: 100.0 %

Minimum Accuracy: 0.0 %

Overall Accuracy: 62.5 %

Standard Deviation is: 0.31933318682925255


In [15]:
#obj.save_model() Commented out due to not overwriting the saved model

[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1200; total time=   1.0s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1200; total time=   1.1s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1800; total time=   1.7s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   1.6s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   1.2s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   1.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=800; total time=   0.9s
[CV] END bootstrap=True, max_dep

[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1200; total time=   1.1s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.4s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   0.9s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.6s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   0.5s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   0.5s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1600; total time=   1.8s
[CV] END bootstrap=False, max_depth=

[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.2s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.4s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   0.9s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1800; total time=   1.6s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   1.5s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   1.2s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   0.9s
[CV] END bootstrap=True, max_dept

[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.2s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   0.9s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.5s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.6s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   0.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1600; total time=   1.7s
[CV] END bootstrap=False, max_depth

[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.2s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1200; total time=   1.1s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1800; total time=   1.6s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   1.5s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1600; total time=   1.9s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.4s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.4s
[CV] END bootstrap=False, max_dept