In [13]:
#Importing necessary packages
import pandas as pd
import numpy as np

In [14]:
# Reading the suicide data classified by profession
suicide_by_profession = pd.read_csv("Profession_profile_of_suicide_victims_state.csv")


In [15]:
suicide_by_profession.head()

Unnamed: 0,STATE/UT,Year,CAUSE,Male upto 14 years,Male 15-29 years,Male 30-44 years,Male 45-59 years,Male 60 years and above,Total Male,Female upto 14 years,Female 15-29 years,Female 30-44 years,Female 45-59 years,Female 60 years and above,Total Female,Grand Total
0,ANDHRA PRADESH,2001,House Wife,0,0,0,0,0,0,0,1132,811,397,104,2444,2444
1,ANDHRA PRADESH,2001,Service (Government),0,34,57,44,0,135,0,23,22,9,7,61,196
2,ANDHRA PRADESH,2001,Service (Private),1,250,275,138,10,674,5,43,24,28,6,106,780
3,ANDHRA PRADESH,2001,Public Sector Undertaking,0,32,36,29,13,110,0,4,5,1,0,10,120
4,ANDHRA PRADESH,2001,Student,56,202,19,18,0,295,46,115,5,9,0,175,470


In [16]:
suicide_by_profession = suicide_by_profession.rename(columns = {"STATE/UT": "State"})

In [20]:
suicide_by_profession.shape

(6300, 16)

In [18]:
#DATA CLEANING -> Removing what is unncessary

In [19]:
#The exploratory analysis revealed that there are some unneccessary rows at the end of the dataset, those would serve no purpose at all. So, it'd be better to just remove them.

discard = ['TOTAL']

suicide_by_profession = suicide_by_profession[~suicide_by_profession.State.str.contains('|'.join(discard))]

In [21]:
#DATA TRANSFORMATION -> making data suitable for analysis

In [22]:
#The next important step is to make changes to data so that it satisfies our analysis requirements. 

# First things first, we are only interested in the suicide data for the year 2011, so dropping all other years except this one.

#Secondly, we are only interested in knowing suicide numbers for unemployed people, so have to discard the rest of profession categories

suicide_by_profession = suicide_by_profession.loc[suicide_by_profession['Year'] == 2011]

suicide_by_profession = suicide_by_profession.loc[suicide_by_profession['CAUSE'] == "Unemployed"]


In [23]:
#Making sure that the data looks like how we want it to be 

suicide_by_profession 

Unnamed: 0,State,Year,CAUSE,Male upto 14 years,Male 15-29 years,Male 30-44 years,Male 45-59 years,Male 60 years and above,Total Male,Female upto 14 years,Female 15-29 years,Female 30-44 years,Female 45-59 years,Female 60 years and above,Total Female,Grand Total
155,ANDHRA PRADESH,2011,Unemployed,0,181,248,98,8,535,0,59,30,14,4,107,642
335,ARUNACHAL PRADESH,2011,Unemployed,0,5,2,0,0,7,1,2,0,0,0,3,10
515,ASSAM,2011,Unemployed,0,7,15,21,5,48,0,6,5,0,0,11,59
695,BIHAR,2011,Unemployed,0,14,22,12,1,49,0,5,2,0,0,7,56
875,CHHATTISGARH,2011,Unemployed,3,182,246,146,15,592,0,18,11,3,2,34,626
1055,GOA,2011,Unemployed,1,13,12,13,4,43,1,5,2,0,0,8,51
1235,GUJARAT,2011,Unemployed,2,163,182,82,14,443,0,24,18,6,0,48,491
1415,HARYANA,2011,Unemployed,4,91,146,75,16,332,0,15,11,17,1,44,376
1595,HIMACHAL PRADESH,2011,Unemployed,0,9,7,0,1,17,0,4,2,0,0,6,23
1775,JAMMU & KASHMIR,2011,Unemployed,0,2,12,4,0,18,0,15,3,0,0,18,36


In [24]:
#This data eventually has to be joined with the census 2011 data for relating unemployment and suicide rate.

#In order to make it suitable for merging, the CAUSE column has to be dropped. 

#As of now, all of the subsequent columns i.e. suicide count by age group and gender reflect the suicides committed by unemployed people only.

suicide_by_profession = suicide_by_profession.drop(['CAUSE'], axis = 1)

In [25]:
#Adjusting the index 

suicide_by_profession.reset_index(inplace = True, drop = True)


In [26]:
suicide_by_profession = suicide_by_profession.sort_values("State")

In [27]:
#Dataset after transformation

suicide_by_profession

Unnamed: 0,State,Year,Male upto 14 years,Male 15-29 years,Male 30-44 years,Male 45-59 years,Male 60 years and above,Total Male,Female upto 14 years,Female 15-29 years,Female 30-44 years,Female 45-59 years,Female 60 years and above,Total Female,Grand Total
28,A & N ISLANDS,2011,0,3,5,2,0,10,0,0,0,1,0,1,11
0,ANDHRA PRADESH,2011,0,181,248,98,8,535,0,59,30,14,4,107,642
1,ARUNACHAL PRADESH,2011,0,5,2,0,0,7,1,2,0,0,0,3,10
2,ASSAM,2011,0,7,15,21,5,48,0,6,5,0,0,11,59
3,BIHAR,2011,0,14,22,12,1,49,0,5,2,0,0,7,56
29,CHANDIGARH,2011,0,8,1,1,0,10,0,2,0,0,0,2,12
4,CHHATTISGARH,2011,3,182,246,146,15,592,0,18,11,3,2,34,626
30,D & N HAVELI,2011,0,0,0,0,0,0,0,0,0,0,0,0,0
31,DAMAN & DIU,2011,0,0,0,0,0,0,0,0,0,0,0,0,0
32,DELHI (UT),2011,1,210,130,25,4,370,1,40,10,3,1,55,425


In [15]:
suicide_by_profession.to_csv("profession.csv",header= True, index= False)

In [28]:
suicide_by_profession.shape

(35, 15)