# CREATIVE DESIGN PROJECT

### DATASET -- Indian Census
#### collected from  http://censusindia.gov.in/
#### A visual study of Indian Census and comparison of states and districts

#### Libraries used - Pandas , Matplotlib , Numpy

In [1]:
import os
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from subprocess import check_output
# Show which files are available in the input folder.  Listing through `os`
# avoids spawning an external `ls`, which is not present on every platform.
# Dot-files are skipped and names are sorted to mirror plain `ls` output.
print("\n".join(sorted(f for f in os.listdir("../input") if not f.startswith("."))))

Import the dataset from the input folder.
The dataset is a comma-separated values (CSV) file.
In computing, a comma-separated values (CSV) file stores tabular data (numbers and text) in plain text. Each line of the file is a data record. Each record consists of one or more fields, separated by commas. The use of the comma as a field separator is the source of the name for this file format.

In [2]:
# Read the census CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/all.csv")
df.head(n=5)

### All states and union territories

In [3]:
from collections import Counter

# Tally the rows per state; the counter's keys are the distinct state names.
state = list(df['State'])
c = Counter(state)
st_name = list(c)
st_name

## The fields taken in account during the census

In [4]:
# Column labels of the raw table, as a NumPy array.
x = np.asarray(df.columns)
x

#### Since the dataset has multiple entries for a single state (one per district), let's group it by state to get a clear per-state view of the data

In [5]:
# One row per state: add up the district-level rows.  `numeric_only=True`
# keeps text columns (e.g. the district names) out of the sum; pandas >= 2.0
# no longer leaves them out by default.
newdf = df.groupby('State').sum(numeric_only=True)
newdf.head(36)

# Population

In [6]:
# Keep only the head-count columns for the population charts.
popdf = newdf.loc[:, ['Persons', 'Males', 'Females']]
popdf.head(n=5)

In [7]:
# Per-state totals of the district rows.  `numeric_only=True` keeps text
# columns out of the sum; pandas >= 2.0 no longer does that by default.
newdf = df.groupby('State').sum(numeric_only=True)
# Display only: shows 'State' as a regular column.  The result is not
# assigned, so newdf itself stays indexed by state.
newdf.reset_index()

### Scatter plot of the population of the different states and union territories; the size of each dot gives a sense of the population of that state

In [38]:
plt.figure()
# One x position per row of newdf, so the plot adapts to however many
# states/territories the data contains.
x = list(range(len(newdf)))
# Dot area is proportional to the population (scaled down to a usable size).
plt.scatter(x=x, y=newdf.Persons, s=newdf.Persons / 200000, alpha=0.5)
# Label the ticks from newdf's own index: groupby sorts the states, so labels
# taken from the raw file order could end up under the wrong dot.
plt.xticks(x, newdf.index, rotation=90)
plt.show()

### Let's get a clearer view of the individual populations with a bar plot

In [34]:
# Grouped bar chart of the head counts per state.  DataFrame.plot creates its
# own figure (sized by `figsize`), so no separate plt.figure() call is needed.
ax = popdf.plot(kind='bar', cmap='Paired', width=0.8, figsize=(30, 10))

#ax.set_facecolor('black')
#

### The percentage of male and female in the whole population

In [39]:
# Country-wide male and female totals, obtained by summing the state rows.
popdf = newdf[['Males', 'Females']]
sum_df = popdf.sum()

labels = ('Males', 'Females')
sizes = [sum_df[gender] for gender in labels]
colors = ['yellowgreen', 'lightcoral']

# Pie chart of the two totals, with each wedge annotated with its percentage.
plt.figure()
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [11]:
# Total head count over every row of the raw table.
totlpop = df.Persons.sum()

In [12]:
# Population per state, plus each state's share of the overall total.
# The share is scaled by 1000 (i.e. expressed per thousand).
sharedf = df[['Persons', 'State']].groupby('State').sum()
sharedf['Share'] = sharedf['Persons'].mul(1000).div(totlpop)
plotdf = sharedf['Share']

In [13]:
# Bar chart of every state's share of the total population.
plt.figure()
xi = plotdf.plot.bar(width=0.8, figsize=(30, 10))

### Let's look at the long-standing question of sex ratio, i.e. females per 1000 males, alongside the sex ratio among children aged 0–6 years (the `Sex.ratio..0.6.years.` column)

In [14]:
# Sex ratios are per-district rates, so adding them up per state (as the
# summed newdf does) yields a number that grows with the district count.
# Average the district values per state instead.
# NOTE(review): this is an unweighted mean over districts, i.e. an
# approximation of the true state-level ratio — confirm it is acceptable.
sex_ratio_cols = ['Sex.ratio..females.per.1000.males.', 'Sex.ratio..0.6.years.']
sexdf = df.groupby('State')[sex_ratio_cols].mean()
ax = sexdf.plot(kind='bar', color=['Red', 'black'], width=0.8, figsize=(30, 10))

### Let's try to draw a relation between education and sex ratio

In [15]:
# Educated head count in thousands (presumably to bring it nearer the scale
# of the ratio column it is plotted against).
newdf['totedu'] = newdf['Total.Educated'].div(1000)

# NOTE(review): the ratio column taken from newdf is the per-state *sum* of
# the district sex ratios, so it grows with the number of districts in a
# state — confirm this is what the comparison below is meant to use.
sx_edudf = newdf.loc[:, ['Sex.ratio..females.per.1000.males.', 'totedu']]
axi = sx_edudf.plot.bar(width=1, colormap='Set1', figsize=(30, 10))

In [40]:
# Kernel density estimate of both columns across the states.
sx_edudf.plot.kde()

## The above plot suggests that education and sex ratio move together: as the number of educated persons increases, the sex ratio improves

In [17]:
# Same sex-ratio table as before, drawn with full-width bars and the 'flag'
# colormap.
ax = sexdf.plot.bar(width=1, colormap='flag', figsize=(30, 10))

# EDUCATION IN INDIA

#### The percentage of population classified as per their education level  

In [18]:
# Country-wide totals for every column of the per-state table.
sum_df = newdf.sum()

# Education levels to chart, in wedge order.
labels = (
    'Graduate.and.Above',
    'Below.Primary',
    'Primary',
    'Middle',
    'Matric.Higher.Secondary.Diploma',
)
sizes = [sum_df[level] for level in labels]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'red']
explode = (0.1, 0, 0, 0, 0)  # pull the first wedge out of the pie

plt.figure()
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### From the above pie chart it's clear that most of the Indian population is educated, or is being educated, below graduation level; only 7% are graduates

# Let's see which state or union territory is best to live in

### Comparison of all the states and union territories based on basic facilities such as safe drinking water, electricity, roads, schools, medical facilities, post office and telegraph facilities, bus services, etc., provided per person

In [19]:
# Facility counts to compare between states.
facility_cols = [
    'Safe.Drinking.water',
    'Electricity..Power.Supply.',
    'Electricity..domestic.',
    'Electricity..Agriculture.',
    'Primary.school',
    'Middle.schools',
    'Secondary.Sr.Secondary.schools',
    'College',
    'Medical.facility',
    'Primary.Health.Centre',
    'Primary.Health.Sub.Centre',
    'Post..telegraph.and.telephone.facility',
    'Bus.services',
    'Paved.approach.road',
    'Permanent.House',
]

# Per-state totals of the head count and of every facility column.
evdf = df[['State', 'Persons'] + facility_cols].groupby('State').sum(numeric_only=True)

# Turn every facility total into a per-person figure.  All facility columns
# are divided (selected by name, not by position) so they share one scale.
per_person_cols = evdf.columns.drop('Persons')
evdf[per_person_cols] = evdf[per_person_cols].div(evdf['Persons'], axis=0)

In [20]:
# The head count was only needed as the divisor; remove it before plotting.
# `axis` must be given by keyword: pandas >= 2.0 rejects it positionally.
evdf = evdf.drop('Persons', axis=1)

In [21]:
# Show up to 50 rows of the per-state facility table.
evdf.head(n=50)

In [22]:
# Grouped bar chart of the facility figures per state.  The 20-colour 'tab20'
# colormap gives each facility column its own colour; a short fixed colour
# list would repeat across columns and make the legend ambiguous.
axi = evdf.plot(kind='bar', width=1, cmap='tab20', figsize=(30, 10))

# NOW LET'S HAVE A CLOSE LOOK AT EACH STATE: JUST REPLACE THE STATE_NAME VARIABLE IN THE CODE BELOW AND HIT SHIFT+ENTER

In [23]:
# Index the raw table by (state, district) so that the districts of a single
# state can be selected by the state's name.
ndf = df.set_index(['State', 'District'])
state_name = 'AN'

# Columns kept for the per-district view, in the order the later cells
# address them by position.
district_cols = [
    'Persons',
    'Males',
    'Females',
    'Sex.ratio..females.per.1000.males.',
    'Sex.ratio..0.6.years.',
    'Persons..literate',
    'Males..Literate',
    'Females..Literate',
    'Persons..literacy.rate',
    'Males..Literatacy.Rate',
    'Females..Literacy.Rate',
    'Safe.Drinking.water',
    'Electricity..Power.Supply.',
    'Electricity..domestic.',
    'Electricity..Agriculture.',
    'Primary.school',
    'Middle.schools',
    'Secondary.Sr.Secondary.schools',
    'College',
    'Medical.facility',
    'Primary.Health.Centre',
    'Primary.Health.Sub.Centre',
    'Post..telegraph.and.telephone.facility',
    'Bus.services',
    'Paved.approach.road',
    'Permanent.House',
]
arudf = ndf.loc[state_name, district_cols]
arudf.head()

## Total Population of individual districts

In [24]:
# Total population per district is the 'Persons' column.
f1 = arudf['Persons']

# Build one short tick label per district from the words of its index value:
# normally the second word is used.
# NOTE(review): seven-word values get their second and third words merged —
# presumably to cover two-word district names; confirm against the data.
x = []
for district in arudf.index:
    words = district.split(' ')
    if len(words) < 2:
        # No second word to pick; fall back to the full value.
        x.append(district)
        continue
    label = words[1]
    if len(words) == 7:
        label += words[2]
    x.append(label)
print(x)

# Tick positions matching the labels; both are reused by the later cells.
m = list(range(len(x)))

plt.figure()
f1.plot(kind='bar', width=0.8, cmap='hsv')
# Relabel the ticks before showing the figure; after plt.show() the call
# would no longer affect the chart that was displayed.
plt.xticks(m, x)
plt.show()

## Males, females and sex ratio

In [25]:
# Columns at positions 1-3 of arudf (Males, Females and the overall sex
# ratio), drawn per district with the short district labels.
f2 = arudf.iloc[:, [1, 2, 3]]
f2.plot.bar(width=0.8, color=['Red', 'yellow', 'black'])
plt.xticks(m, x)

## Number of literate persons: total, male and female

In [26]:
# Columns at positions 5-7 of arudf (literate persons, males and females),
# drawn per district with the short district labels.
f3 = arudf.iloc[:, [5, 6, 7]]
f3.plot.bar(width=0.8, colormap='hsv', figsize=(30, 10))
plt.xticks(m, x)

## Safe drinking water in each district

In [27]:
# Column at position 11 of arudf ('Safe.Drinking.water'), kept as a
# one-column DataFrame and drawn per district.
f4 = arudf.iloc[:, [11]]
# 'tab10' is the current name of the palette formerly called 'Vega10', which
# matplotlib >= 3.0 no longer recognises.
f4.plot(kind='bar', width=0.8, cmap='tab10', figsize=(30, 10))
plt.xticks(m, x)

### The code below handles missing and non-numeric data in the dataset

In [28]:
def isnumber(x):
    """Return True if *x* can be converted with ``float()``, else False.

    Values such as ``"nan"`` or ``"inf"`` count as numbers because
    ``float()`` accepts them.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None, list);
        # ValueError: unparseable text; OverflowError: int too large.
        return False
    return True

# Convert every column to a numeric dtype; cells that cannot be parsed as a
# number become NaN.  Masking alone would leave such columns as object dtype,
# which the plotting code below does not treat as numeric data.
arudf = arudf.apply(pd.to_numeric, errors='coerce')

## Comparison of districts based on electricity supply, primary schools, telegraph facilities and number of permanent houses

In [29]:
# Every column of arudf from position 12 onward (the power-supply column and
# the facility columns after it), drawn per district.
first_facility_pos = 12
f5 = arudf.iloc[:, first_facility_pos:]
f5.plot.bar(width=0.8, colormap='hsv', figsize=(30, 10))
plt.xticks(m, x)

In [30]:
import os
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from subprocess import check_output
# Show which files are available in the input folder.  Listing through `os`
# avoids spawning an external `ls`, which is not present on every platform.
# Dot-files are skipped and names are sorted to mirror plain `ls` output.
print("\n".join(sorted(f for f in os.listdir("../input") if not f.startswith("."))))

In [31]:
# Read the census CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/all.csv")
df.head(n=5)

In [32]:
from collections import Counter

# Tally the rows per state; the counter's keys are the distinct state names.
state = list(df['State'])
c = Counter(state)
st_name = list(c)
st_name