In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
basicData = pd.read_csv('Districtwise_Basicdata.csv',header=[1])
enrData = pd.read_csv('Districtwise_Enrollment_details_indicator.csv',header=[3])
schoolData = pd.read_csv('Districtwise_SchoolData.csv',header=[3])
teacherData = pd.read_csv('Districtwise_Teacher_indicator.csv',header=[3])

In [3]:
renameCol = {'Year' : 'year', 'ac_year': 'year', 'Statecd': 'statecd'}
dropCol = ['statename','distname']
basicData.drop(dropCol, axis=1,inplace=True)
basicData.rename(columns=renameCol, inplace=True)

In [4]:
teacherData.drop(dropCol, axis=1, inplace=True)
teacherData.rename(columns=renameCol, inplace=True)

In [5]:
dropCol = ['State Name ','distname']
schoolData.drop(dropCol, axis=1, inplace=True)
schoolData.rename(columns=renameCol, inplace=True)

In [6]:
enrData.drop(dropCol, axis=1, inplace=True)
enrData.rename(columns=renameCol, inplace=True)

In [7]:
left = basicData
right = teacherData
dflist = [schoolData, enrData]
for df in dflist:
    data = pd.merge(left, right, how='outer', on=['year', 'statecd','distcd'])
    left = data
    right = df

In [8]:
replaceVal = {'2012-13': 1, '2013-14': 2}
data.year = data.year.map(replaceVal)

In [9]:
data.columns.to_series().groupby(data.dtypes).groups

{dtype('int64'): Index(['year', 'statecd', 'distcd', 'blocks', 'clusters', 'villages',
        'totschools', 'tch_govt1', 'tch_govt2', 'tch_govt3',
        ...
        'Computer Sch4', 'Computer Sch5', 'Mdm 1', 'Mdm 2', 'Mdm 3', 'Mdm 5',
        'Mdm 6', 'Smc 1', 'Smc 2', 'Smc 4'],
       dtype='object', length=335),
 dtype('float64'): Index(['totpopulation', 'p_06_pop', 'p_urb_pop', 'sexratio', 'sexratio_06',
        'growthrate', 'p_sc_pop', 'p_st_pop', 'female_lit', 'schgovt4',
        'schgovt6', 'schgovt7', 'schgovt9', 'schpvt3', 'schpvt4', 'schpvt5',
        'schpvt7', 'schpvt9', 'Sch R Govt2', 'Sch R Govt3', 'Sch R Govt4',
        'Sch R Govt5', 'Sch R Govt6', 'Sch R Govt9', 'Sch R Pvt3', 'Sch R Pvt4',
        'Sch R Pvt5', 'Sch R Pvt9', 'Cls1 School3', 'Gtoilet Sch1',
        'Gtoilet Sch2', 'Gtoilet Sch3', 'Gtoilet Sch4', 'Gtoilet Sch5',
        'Gtoilet Sch6', 'Gtoilet Sch7', 'Gtoilet Sch', 'Sch 50enr3', 'Kitshed3',
        'Cls Minor3', 'Cls Other3', 'Cls Other', 'Sdg 1', 'S

In [10]:
data.overall_lit.unique()

array(['High', 'Low', 'Medium', nan], dtype=object)

In [11]:
data.overall_lit = data.overall_lit.map({'High': 1, 'Low': -1, 'Medium': 0})

In [12]:
data.isna().any()[data.isna().any() == True]

totpopulation        True
p_06_pop             True
p_urb_pop            True
sexratio             True
sexratio_06          True
growthrate           True
p_sc_pop             True
p_st_pop             True
overall_lit          True
female_lit           True
schgovt4             True
schgovt6             True
schgovt7             True
schgovt9             True
schpvt3              True
schpvt4              True
schpvt5              True
schpvt7              True
schpvt9              True
Sch R Govt2          True
Sch R Govt3          True
Sch R Govt4          True
Sch R Govt5          True
Sch R Govt6          True
Sch R Govt9          True
Sch R Pvt3           True
Sch R Pvt4           True
Sch R Pvt5           True
Sch R Pvt9           True
Cls1 School3         True
                     ... 
Tlm 7                True
Station P G          True
Residential Up B     True
Sch Un2              True
Sch Un4              True
Btoilet Sch6         True
Electric Sch4        True
Electric Sch

In [13]:
data.fillna(method='ffill', inplace=True)

In [14]:
# Function to clip and clam the data
def clip_clamp(x, mean, sd):
    # Checking whether the value is less than a differenced value between mean and standard deviation.
    if x < mean - 2*sd :
        return mean - 2*sd
    #Checking whether the value is greater than a differenced value between mean and standard deviation.
    elif x > mean + 2*sd :
        return mean + 2*sd
    # If above two conditions are not statisfied we will return the original value
    else :
        return x

In [15]:
# Function to smooth the data
def smooth_out(Total_data):
    for i in Total_data.columns:
        # Calculating the mean value
        mean = np.mean(Total_data[i].values, axis=0)
        # Calculating the standard deviation value
        sd = np.std(Total_data[i].values, axis=0)
        # Calculating the corrected value using clip and clamp function
        corrected = np.array([clip_clamp(x, mean, sd) for x in Total_data[i].values])
        # Storing the data in form of series
        Total_data[i] = pd.Series(corrected, index=Total_data[i].index)
    return Total_data

In [16]:
import numpy as np
data = smooth_out(data)

In [17]:
def corr_features(df,cols,bar=0.9):
    for c,i in enumerate(cols[:-1]):
        col_set = set(cols)
        for j in cols[c+1:]:
            if i==j:
                continue
           
            score = df[i].corr(df[j])
            
            if score>bar:
                cols = list(col_set-set([j]))
            if score<-bar:
                cols = list(col_set-set([j]))
    return cols

In [18]:
features = corr_features(data, data.columns)

In [19]:
features

['tch_bs1',
 'Enr Stch Sch5',
 'Cls Major',
 'tch_nr7',
 'Sch 50enr7',
 'Sdg 4',
 'Scr 35 Up',
 'Tlm 6',
 'No Fem Sch3',
 'tch_pvt3',
 'Smc 4',
 'tch_sc_m2',
 'tch_f_p1',
 'Pp Sch1',
 'Sch R Govt4',
 'Computer Sch5',
 'tch_sc_f5',
 'tch_st_m6',
 'Computer Sch2',
 'tch_bs4',
 'Attend Up G',
 'tch_f5',
 'tch_mph1',
 'tch_f_p5',
 'tch_f4',
 'Uniform P B',
 'Sch R Pvt2',
 'p_sc_pop',
 'Computer Sch1',
 'Sch R Govt2',
 'tch_nr_p5',
 'Sch 50enr2',
 'Tch1 School3',
 'Attend P G',
 'tch_nr5',
 'Sch Un2',
 'tch_pgrad2',
 'tch_s7',
 'tch_pgrad6',
 'tch_sc_f1',
 'tch_pgrad7',
 'Mdm 5',
 'Kitshed2',
 'Tot Cls2',
 'tch_s5',
 'tch_s4',
 'Sch R Pvt7',
 'tch_nr_p4',
 'tch_m_p2',
 'overall_lit',
 'tch_un6',
 'Cls Good2',
 'tch_grad7',
 'tch_pvt7',
 'Enr Stch Sch3',
 'Gtoilet Sch7',
 'tch_eduqual_nr5',
 'Sdg 7',
 'No Fem Sch2',
 'Sch Since 2003 7',
 'Tch1 School',
 'tch_f_p6',
 'Station Up G',
 'tch_st_f5',
 'Cls Good6',
 'Water Sch5',
 'Cls Other4',
 'schgovt2',
 'trn_tch_m1',
 'tch_f1',
 'Book P G',
 

In [20]:
features.remove('overall_lit')

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [22]:
scaler.fit(data[features])

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
data[features] = scaler.transform(data[features])

  """Entry point for launching an IPython kernel.


In [24]:
def callKnn(data,targets):
  X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=0.33)
  neigh = KNeighborsClassifier(n_neighbors=3)
  neigh.fit(X_train, y_train)
  predicted_labels = neigh.predict(X_test)
  return accuracy_score(y_test,predicted_labels)

In [25]:
callKnn(data,data.overall_lit)

0.6041189931350115