In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Reading DataFrame**

In [1]:
# Load the college data set from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/top-indian-colleges/College_data.csv')

In [1]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [1]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

In [1]:
%%time
for col in df.columns:
    df[col] = np.where(df[col]=='--',np.nan,df[col])

In [1]:
# Every column from position 5 onwards is converted to float.
col_to_cast = df.columns[5:]
df = df.astype({column: float for column in col_to_cast})

In [1]:
# Show the dtype of each column to check the result of the float cast.
df.dtypes

**Find Missing Values**

In [1]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

**Visualize percentage of missing values**

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of missing values per column, expressed as a percentage (0-100):
# the mean of the boolean null mask is the fraction of nulls in that column.
missing_pct = df.isnull().mean() * 100

plt.figure(figsize=(10,5))
sns.barplot(x=missing_pct.index, y=missing_pct.values)
plt.ylabel('Missing values (%)')
plt.xticks(rotation=90)  # draw the column-name labels vertically; not needed if the figure is wide enough
plt.show()

**Find total number of unique colleges by state**

In [1]:
# Number of distinct college names in each state.
df_plot = df.groupby(by=['State']).College_Name.nunique()

plt.figure(figsize=(20,7))
sns.barplot(x=df_plot.index, y=df_plot)
plt.ylabel('Number of unique colleges')
# Rotate the tick labels after the bars are drawn (same order as the
# missing-values plot) so the rotation is applied to the final state labels.
plt.xticks(rotation=90)
plt.show()  # also keeps the Axes repr out of the cell output

**Find total number of colleges per state per stream**

In [1]:
# Distinct college names per (State, Stream) pair, flattened to a regular
# frame whose value column is called 'Counts'.
g = (
    df.groupby(['State', 'Stream'])['College_Name']
    .nunique()
    .reset_index(name='Counts')
)

In [1]:
# One bar chart per state (stacked in a single column of panels), with the
# streams on the x axis and the college count on the y axis.
sns.catplot(
    data=g,
    kind='bar',
    x='Stream',
    y='Counts',
    col='State',
    col_wrap=1,
    sharex=False,
    height=7,
    aspect=2,
)

**Placement Stats**

In [1]:
# Minimum, maximum and mean of the 'Placement' column for every
# (State, Stream) pair, with the group keys restored as ordinary columns.
g = (
    df.groupby(by=['State', 'Stream'])
    .agg(
        min_placement=('Placement', 'min'),
        max_placement=('Placement', 'max'),
        mean_placement=('Placement', 'mean'),
    )
    .reset_index()
)

In [1]:
def find_placement(state='Andaman', stats=None):
    """Draw the placement statistics of one state as bar charts, one panel per stream.

    Parameters
    ----------
    state : str
        Value of the ``State`` column to plot.
    stats : DataFrame, optional
        Frame with ``State`` and ``Stream`` columns plus the statistics to
        plot. When omitted, the module-level frame ``g`` is used, so ``g``
        must hold the placement statistics at the time of the call.

    Raises
    ------
    ValueError
        If no row of the frame has the requested state.
    """
    if stats is None:
        stats = g
    state_stats = stats[stats.State == state]
    # Fail with a readable message instead of an error from inside seaborn
    # when there is nothing to plot for this state.
    if state_stats.empty:
        raise ValueError('No placement data for state {!r}'.format(state))
    sns.catplot(data=state_stats,row='State',col='Stream',kind='bar',sharex=False,height=5,aspect=1)

find_placement('Gujarat')

**Fees of Colleges**

In [1]:
from re import sub
from decimal import Decimal

def _parse_fee(value):
    """Convert one fee cell to a float amount, or NaN when it holds no amount.

    Strings are stripped of every character that is not a digit or a dot and
    the remainder is parsed as a float. Values that are already numeric are
    returned as floats, so running this cell a second time does not wipe the
    parsed fees. Anything else (e.g. None) becomes NaN.
    """
    if isinstance(value, str):
        cleaned = sub(r'[^\d.]', '', value)
        # A string without a single digit carries no amount to parse.
        if not any(ch.isdigit() for ch in cleaned):
            return np.nan
        return float(cleaned)
    if isinstance(value, (int, float, np.number)) and not isinstance(value, bool):
        return float(value)
    return np.nan


# Apply the same parser to both fee columns.
for fee_col in ('UG_fee', 'PG_fee'):
    df[fee_col] = df[fee_col].apply(_parse_fee)

In [1]:
# Both fee summaries use the same (State, Stream) grouping, so build it once.
by_state_stream = df.groupby(by=['State', 'Stream'])

# Undergraduate fee: min / max / mean per group.
g1 = by_state_stream.agg(
    UG_min_fee=('UG_fee', 'min'),
    UG_max_fee=('UG_fee', 'max'),
    UG_mean_fee=('UG_fee', 'mean'),
)
# Postgraduate fee: min / max / mean per group.
g2 = by_state_stream.agg(
    PG_min_fee=('PG_fee', 'min'),
    PG_max_fee=('PG_fee', 'max'),
    PG_mean_fee=('PG_fee', 'mean'),
)

In [1]:
# Undergraduate fee summary for every stream in Gujarat.
g1.loc['Gujarat']

In [1]:
# Postgraduate fee summary for every stream in Karnataka.
g2.loc['Karnataka']