In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
import statsmodels.api         as     sm
from   statsmodels.formula.api import ols
from   statsmodels.stats.anova import anova_lm
import copy

In [None]:
#I've modified the name of the data columns and its position for clear reference,
#use the placement_data file with same data

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the placement dataset into the working DataFrame.
placement_csv = '../input/placement-data/Placement_Data.csv'
df = pd.read_csv(placement_csv)

In [None]:
# Preview the first five rows.
df.head(n=5)

# UNI-Variate analysis.

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

## The data has 215 instances with 15 attributes: 1 integer type, 6 float type and 8 object (string) type.

## Here the Target Variable in the dataset is Placement Status. 

## Categorical Attributes:
1.Gender
2.Xth board.
3.XII board.
4.XII Stream
5.UG Field
6.PG Field
7.Work Experience
   

## Numerical Attributes:
1.X percentage.
2.XII percentage.
3.UG Percentage
4.PG Percentage(MBA)
5.Salary

In [None]:
# Null-value check: for every column, count present (False) vs. missing (True) entries.
# Uses the Series method because the top-level `pd.value_counts` function is deprecated.
df.isna().apply(lambda column: column.value_counts())

# There are 67 null values in total, all of them in the Salary column

In [None]:
# Summary statistics for every column (numeric and categorical), one row per column.
df.describe(include='all').transpose()

# According to the data set

- 64.65% of the students in the dataset are male (139 of 215).
- It can be seen in the dataset that 46.05% of the students graduated in X from central board.
- 39.07% of the students graduated in XII from central board.
- ~53% of the students took Commerce as their stream and only 5.12% took arts, rest were Science students.
- More than 65% of the overall students specialised in Science and tech during UG.
- Nearly 65% of the students have former workex.
- 55.81% of the students specialised in Marketing and HR, while 44.19% specialised in Marketing and Finance.
- Percentage of all exams have been normally distributed which we can further see in the graphs below
- How do the percentage of all exams and the degree of students impact the placements can be seen further

In [None]:
# Summary statistics for the numeric columns only, one row per column.
df.describe().transpose()

In [None]:
# Histogram of each continuous feature, one panel per feature on a 3x3 grid.
# Each entry: (column, bar colour, opacity, x-axis label).
hist_panels = [
    ('X_P', 'crimson', 1, 'Xth'),
    ('XII_P', 'darkgrey', 0.7, 'XII'),
    ('UG_P', 'lime', 0.7, 'UG'),
    ('PG_P', 'gold', 0.7, 'PG '),
    ('Etest_P', 'cornflowerblue', 0.7, 'Employability Test'),
    ('Salary', 'hotpink', 0.7, 'Salary'),
]

plt.figure(figsize=(20, 15))
for position, (column, colour, opacity, label) in enumerate(hist_panels, start=1):
    plt.subplot(3, 3, position)
    plt.hist(df[column], color=colour, edgecolor='black', alpha=opacity)
    plt.xlabel(label)
plt.show()


##     -As seen in the graph Xth,XIIth, UG and PG Percentage have seen to be Normally Distributed.

##     -Employability Test Percentage is quite Uniformly distributed. 

In [None]:
# Intended to discard the missing salaries before the skewness of Salary is measured.
# NOTE(review): this calls an in-place method on the Series returned by `df.Salary`;
# whether `df` itself observes the change depends on the pandas version — confirm that
# `df.Salary` is NaN-free afterwards.
df.Salary.dropna(inplace=True)

#### Since we are calculating the skewness of salary, we have to drop all the non-placed students. If we instead replaced the null values with 0, it would distort the skewness value and we would not get the desired output.

In [None]:
# Measure the skewness of the continuous columns.
# Missing values are dropped per column before calling `stats.skew`, which would
# otherwise return NaN for a column that still contains NaN (Salary is missing for
# some rows). Doing it here keeps this cell independent of any earlier clean-up.
skew_columns = ['X_P', 'XII_P', 'UG_P', 'PG_P', 'Etest_P', 'Salary']
Skewness = pd.DataFrame(
    {'Skewness': [stats.skew(df[column].dropna()) for column in skew_columns]},
    index=skew_columns,
)
Skewness

##     -There hardly seems to be any skewness in any of the columns except salary which is highly skewed.

In [None]:
# Density curve of Salary with vertical markers at the mean (red) and median (green).
salary = df['Salary']
density_ax = salary.plot(kind='density')
density_ax.vlines(salary.mean(), ymin=0, ymax=7e-6, color='red')
density_ax.vlines(salary.median(), ymin=0, ymax=7e-6, color='green')

In [None]:
# Re-read the CSV so that `df` is rebuilt from the file for the cells that follow.
df = pd.read_csv('../input/placement-data/Placement_Data.csv')

##     -Here Mean is larger than Median and the graph shows it as positive skew.

In [None]:
# Box plots of the continuous features, three stacked panels per figure.
# Each entry: (column, extra keyword arguments for sns.boxplot).
first_figure_panels = [
    ('X_P', {}),
    ('XII_P', {'color': 'pink'}),
    ('UG_P', {'color': 'magenta'}),
]
second_figure_panels = [
    ('PG_P', {'color': 'darkturquoise'}),
    ('Etest_P', {'color': 'mediumspringgreen'}),
    ('Salary', {'color': 'lightblue'}),
]

for figure_size, panels in (((15, 15), first_figure_panels), ((20, 15), second_figure_panels)):
    plt.figure(figsize=figure_size)
    for row, (column, box_style) in enumerate(panels, start=1):
        plt.subplot(3, 1, row)
        sns.boxplot(x=df[column], **box_style)
plt.show()

##     -Only the Higher Secondary Education and Degree Percentage columns have a few extreme values, whereas the Salary column is highly skewed and shows quite a few extreme values.

In [None]:
# Placement status counts broken down by gender.
gender_placed_record = df.groupby('Gender')['Status']
gender_placed_record.value_counts()

In [None]:
# Count plot for the above observation: placement status split by gender.
# Columns are passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='Gender', hue='Status', data=df, palette='winter');

### From the above analysis we can see that out of 215, total of 148 students have been placed and 67 students are not placed. Out of 148 students placed,100 students are male and 48 students are female and out of 67 not placed students 39 students are male and 28 are female students.

## Which UG stream has more no of placements? 

In [None]:
# Placement status counts broken down by undergraduate field.
dept_status_record = df.groupby(['UG_Field'])['Status']
dept_status_record.value_counts()

In [None]:
# Salary distribution per UG field: a violin plot with the individual points
# (coloured by placement status) drawn on top.
salary_by_field = dict(x="UG_Field", y="Salary", data=df)
sns.violinplot(**salary_by_field)
sns.stripplot(hue='Status', **salary_by_field)

#### From the above data we can infer that Comm&Mgmt students are getting the most placements, followed by Sci&Tech students, whereas the other fields have very few placements.

In [None]:
# Placement status counts broken down by PG specialization.
dept_status_pg_record = df.groupby(['PG_Specialization'])['Status']
dept_status_pg_record.value_counts()

In [None]:
# Count plot for the above observation: placement status split by PG specialization.
# Columns are passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='PG_Specialization', hue='Status', data=df);

#### With respect to PG specialization we can infer that Mkt&Fin students have got more placements than Mkt&HR students.

## Bi-variate Distributions of every possible pair.

In [None]:
# Label-encode the categorical columns before the pair plot, because pairplot
# only draws numeric columns.
categorical_columns = ['Gender', 'X_Board', 'XII_Board', 'XII_Stream',
                       'UG_Field', 'PG_Specialization', 'Work_exp']
df_encoded = df.copy()
# Assign through df[...] (not .loc[:, ...]) so the columns are replaced and take the
# integer dtype of the encoded values; an in-place .loc write can keep the original
# object dtype, which would make pairplot skip the encoded columns.
df_encoded[categorical_columns] = df_encoded[categorical_columns].apply(LabelEncoder().fit_transform)
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made beforehand (it would only leave an empty figure behind).
sns.pairplot(df_encoded)  #pairplot
plt.show()

In [None]:
# Pair plot of the numeric columns, coloured by placement status.
sns.pairplot(data=df, hue='Status')


#### In the diagonal plots of the first pair plot we can see that the values are approximately normally distributed.

#### Here we can also see the correlation between percentages of students with salary.

#### In the second pairplot with respect to the status placed or not, we can see that the students with percentage more than 60 in Xth has higher chance of getting placed, same for  XII and UG.  The scenario is different with respect PG percentage 

In [None]:
# Pearson correlation between the numeric features, drawn as a heat map.
# Reload the data, drop the serial-number column and treat a missing salary as 0.
df = pd.read_csv('../input/placement-data/Placement_Data.csv').drop(columns=['Sl_No'])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `df.Salary`: the in-place call acts on the extracted Series and is not
# guaranteed to update `df`.
df['Salary'] = df['Salary'].fillna(0)

numeric_data = df.select_dtypes(include=[np.number])
cor = numeric_data.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Correlation matrix of the numeric columns only; `corr()` on a frame that still
# contains string columns raises in recent pandas versions.
df.select_dtypes(include=[np.number]).corr()

#### In the above heat map we can see there is moderate correlation between Salary and X, XII and UG percentage, but PG and Etest percentage have low correlation with Salary.

## To get placed with a Highest package, which Specialization should i choose in PG?

In [None]:

# Scatter plot of PG percentage against salary, coloured by PG specialization,
# to look for visual evidence of a relationship between specialization and salary.
plt.figure(figsize=(8,6))
# x / y / hue are passed by keyword: recent seaborn releases no longer accept the
# x and y vectors positionally.
sns.scatterplot(x='PG_P', y='Salary', hue='PG_Specialization', data=df,
                palette=['red', 'green'], alpha=0.9)
plt.show()


#### From the above graph we can infer that choosing central board in XII has high probability of getting highest package.

## Who is not getting placed?

In [None]:
# Swarm plots of X, XII and UG percentage against placement status
# (the XII and UG plots are additionally coloured by gender).
swarm_options = dict(x="Status", data=df, kind="swarm")
sns.catplot(y="X_P", **swarm_options)
sns.catplot(y="XII_P", hue='Gender', **swarm_options)
sns.catplot(y="UG_P", hue='Gender', **swarm_options)

#### Inferring from the percentages scored in X, XII and Degree, the students who scored less than 60 percent in 10th, 12th or degree are mostly not getting placed.

#  Statement of Significance

##  Is salary influenced by factors like Gender, Specialization and Work Experience?

### Stating Null and Alternate Hypothesis

The Null hypothesis of each set is given below
- 1)The means of the factor (Gender) are equal.
- 2)The means of the second factor(PG Specialisation)are equal.
- 3)The means of the third factor (Work Experience) are equal.

Alternate Hypothesis:
- 1)The means of the first factor (Gender) are not equal.
- 2) The means of the second factor (PG Specialization) are not equal.
- 3) The means of the third factor (Work Experience) are not equal.

In [None]:
# Replace missing salaries with 0 before fitting the model.
df['Salary'] = df['Salary'].fillna(0)

In [None]:
# ANOVA (type 1) of Salary on the categorical factors, fitted with OLS.
anova_factors = ['Gender', 'X_Board', 'XII_Board', 'XII_Stream',
                 'UG_Field', 'PG_Specialization', 'Work_exp']
formula = 'Salary ~ ' + ' + '.join(f'C({factor})' for factor in anova_factors)
model = ols(formula, data=df).fit()
aov_table = anova_lm(model, typ=1)

print(aov_table)

### Conclusion:

In this example

- p value for Gender is 0.025631 and < 0.05 so we reject the null hypothesis (1) and conclude that the Gender is having an  effect on Salary.
- p value for Specialisation is 0.000046 and < 0.05 so we reject the null hypothesis (2) and conclude that the Specialisation is having an effect on Salary.
- p value for Work_experience is 0.000133 and < 0.05 so we reject the null hypothesis (3) and conclude that the Work_experience is having an effect on Salary.

## Does Gender have an effect on Placement?

In [None]:
# Contingency table of gender (rows) against placement status (columns), without margins.
ata_crosstab = pd.crosstab(index=df['Gender'], columns=df['Status'])
ata_crosstab

In [None]:
# Chi-square test to check whether placement status differs between genders.
Ho = "Gender has no effect on Job status"   # Stating the Null Hypothesis
# The alternate hypothesis must be the opposite of the null; it was previously an
# exact copy of Ho, so a significant result printed the wrong conclusion.
Ha = "Gender has an effect on Job status"   # Stating the Alternate Hypothesis

chi, p_value, dof, expected = stats.chi2_contingency(ata_crosstab)

# Built-in round() works whether p_value is a NumPy scalar or a plain float.
if p_value < 0.05:  # Setting our significance level at 5%
    print(f'{Ha} as the p_value ({round(p_value, 3)}) < 0.05')
else:
    print(f'{Ho} as the p_value ({round(p_value, 3)}) > 0.05')

#### Placement is irrespective of genders.

## Does placement status differ with different specialization in PG?

In [None]:
# Contingency table of PG specialization (rows) against placement status (columns), without margins.
ata_crosstab = pd.crosstab(index=df['PG_Specialization'], columns=df['Status'])
ata_crosstab

In [None]:
# Chi-square test of independence between PG specialization and placement status.
Ho = "Specialization has no effect on Placement status"   # null hypothesis
Ha = "Specialization has an effect on Placement status"   # alternate hypothesis

chi, p_value, dof, expected = stats.chi2_contingency(ata_crosstab)

# 5% significance level: report Ha only when the p-value is below 0.05.
rounded_p = p_value.round(3)
outcome = (f'{Ha} as the p_value ({rounded_p}) < 0.05' if p_value < 0.05
           else f'{Ho} as the p_value ({rounded_p}) > 0.05')
print(outcome)


#### Specialization does have an effect on placements.

## If the PG specialization have an effect on Placement, do percentage scored in PG also have an effect?

In [None]:
# Independent two-sample t-test: PG percentage of placed vs. not-placed students.
Ho = "PG_P of Placed and non-Placed are same"   # null hypothesis
Ha = "PG_P of Placed and non-Placed are not the same"   # alternate hypothesis

placed_rows = df.Status == 'Placed'
not_placed_rows = df.Status == 'Not Placed'
x = df.loc[placed_rows, "PG_P"].values
y = df.loc[not_placed_rows, "PG_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

#### As we failed to reject the null hypothesis, we can infer that the percentage a student scored in PG does not have a significant effect on his/her placement.

## Does percentage scored in PG influence the job placement with respect to specialization?

In [None]:
# Independent two-sample t-test: PG percentage of placed vs. not-placed students,
# restricted to the Mkt&Fin specialization.
Ho = "MBA_P of Placed and non-Placed are same"   # null hypothesis
Ha = "MBA_P of Placed and non-Placed are not the same"   # alternate hypothesis

in_specialization = df.PG_Specialization == 'Mkt&Fin'
x = df.loc[(df.Status == 'Placed') & in_specialization, "PG_P"].values
y = df.loc[(df.Status == 'Not Placed') & in_specialization, "PG_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

In [None]:
# Independent two-sample t-test: PG percentage of placed vs. not-placed students,
# restricted to the Mkt&HR specialization.
Ho = "MBA_P of Placed and non-Placed are same"   # null hypothesis
Ha = "MBA_P of Placed and non-Placed are not the same"   # alternate hypothesis

in_specialization = df.PG_Specialization == 'Mkt&HR'
x = df.loc[(df.Status == 'Placed') & in_specialization, "PG_P"].values
y = df.loc[(df.Status == 'Not Placed') & in_specialization, "PG_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

#### In the case of Mkt&Fin, the PG percentage has more significance for placement than in the case of Mkt&HR, but overall the PG percentage is not significant for job placement.

## Does Etest percentage scored is significant for job placement along with the specializations?

In [None]:
# Independent two-sample t-test: employability-test percentage of placed vs.
# not-placed students.
Ho = "Etest_P of Placed and non-Placed are same"   # null hypothesis
Ha = "Etest_P of Placed and non-Placed are not the same"   # alternate hypothesis

placed_rows = df.Status == 'Placed'
not_placed_rows = df.Status == 'Not Placed'
x = df.loc[placed_rows, "Etest_P"].values
y = df.loc[not_placed_rows, "Etest_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

In [None]:
# Independent two-sample t-test: employability-test percentage of placed vs.
# not-placed students, restricted to the Mkt&HR specialization.
Ho = "Etest_P of Placed and non-Placed are same"   # null hypothesis
Ha = "Etest_P of Placed and non-Placed are not the same"   # alternate hypothesis

in_specialization = df.PG_Specialization == 'Mkt&HR'
x = df.loc[(df.Status == 'Placed') & in_specialization, "Etest_P"].values
y = df.loc[(df.Status == 'Not Placed') & in_specialization, "Etest_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

In [None]:
# Independent two-sample t-test: employability-test percentage of placed vs.
# not-placed students, restricted to the Mkt&Fin specialization.
Ho = "Etest_P of Placed and non-Placed are same"   # null hypothesis
Ha = "Etest_P of Placed and non-Placed are not the same"   # alternate hypothesis

in_specialization = df.PG_Specialization == 'Mkt&Fin'
x = df.loc[(df.Status == 'Placed') & in_specialization, "Etest_P"].values
y = df.loc[(df.Status == 'Not Placed') & in_specialization, "Etest_P"].values

t, p_value = stats.ttest_ind(x, y, axis=0)

# 5% significance level: report Ha only when the p-value is below 0.05.
conclusion = (f'{Ha} as the p_value ({p_value}) < 0.05' if p_value < 0.05
              else f'{Ho} as the p_value ({p_value}) > 0.05')
print(conclusion)

#### The Employability Test on its own does not have any significance, whereas with respect to PG specialization we can see that the Etest percentage has more significance for Mkt&HR than for Mkt&Fin.

# Simulation 

In [None]:
# Populations used for the Central Limit Theorem simulation:
# salary (missing values replaced by 0), PG percentage and employability-test percentage.
series1 = df['Salary'].fillna(0)
series2 = df['PG_P']
series3 = df['Etest_P']


In [None]:

def central_limit_theorem(data, n_samples = 500, sample_size = 100, random_state = None):
    """Demonstrate the Central Limit Theorem by simulation.

    Draws `n_samples` random samples of `sample_size` observations each (with
    replacement) from `data`, computes the mean of every sample, and plots the
    population distribution (left panel) next to the distribution of the sample
    means (right panel).

    Parameters
    ----------
    data : 1D array-like or pd.Series
        The population. Missing values are ignored.
    n_samples : int, default 500
        Number of samples to draw.
    sample_size : int, default 100
        Number of observations in every sample.
    random_state : int or None, default None
        Seed for the random generator; pass an integer for reproducible plots.

    Raises
    ------
    ValueError
        If `data` holds no non-null value, or if `n_samples` or `sample_size`
        is smaller than 1.

    Returns
    -------
    None
        The function only draws the figure.
    """
    if n_samples < 1 or sample_size < 1:
        raise ValueError("n_samples and sample_size must both be at least 1")

    # Normalise the input: accept arrays as well as Series, drop missing values and
    # work on plain positional values so the original index labels do not matter.
    population = pd.Series(data).dropna().reset_index(drop=True)
    if population.empty:
        raise ValueError("data must contain at least one non-null value")

    # Every row of `draws` is one sample with exactly `sample_size` observations.
    rng = np.random.default_rng(random_state)
    draws = rng.choice(population.to_numpy(), size=(n_samples, sample_size), replace=True)
    sample_means = pd.Series(draws.mean(axis=1), name='Mean')

    fig, (population_ax, sampling_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # histplot with a KDE overlay on a density scale replaces the deprecated distplot.
    sns.histplot(population, kde=True, stat='density', ax=population_ax)
    population_ax.set_title(f"Population Distribution. \n \u03bc = {round(population.mean(), 3)} & \u03C3 = {round(population.std(),3)}")
    population_ax.set_xlabel('data')
    population_ax.set_ylabel('freq')

    sns.histplot(sample_means, kde=True, stat='density', ax=sampling_ax)
    sampling_ax.set_title(f"Sampling Distribution. \n \u03bc = {round(sample_means.mean(), 3)} & SE = {round(sample_means.std(),3)}")
    sampling_ax.set_xlabel('data')
    sampling_ax.set_ylabel('freq')

    plt.show()

In [None]:
central_limit_theorem(series1,n_samples = 500, sample_size = 100)


In [None]:
central_limit_theorem(series2,n_samples = 500, sample_size = 100)


In [None]:
central_limit_theorem(series3,n_samples = 500, sample_size = 100)


## Regardless of the distribution of the population, the sampling distribution follows the Normal distribution. Hence proving the CLT by Simulation 