In [3]:
#importing libraries
import numpy as np
import pandas as pd
import scipy.stats as stats

In [4]:
#descriptive statistics
data = np.array([10,20,30,40,50,60,70])
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(data)
variance = np.var(data)

print(mean)
print(median)
print(mode)
print(variance)

40.0
40.0
ModeResult(mode=10, count=1)
400.0


In [5]:
#inferential statistics 
confidence_level = 0.95
degrees_freedom = len(data)-1
confidence_interval = stats.t.interval(confidence_level,degrees_freedom,mean, stats.sem(data))
print(confidence_level)

0.95


In [9]:
#hypothesis testing [exists a significant difference]
np.random.seed(42)
groupA = np.random.normal(50,10,100)
groupB = np.random.normal(53,10,100)
groupC = np.random.normal(51,10,100)
t_stat,p_value = stats.ttest_ind(groupA,groupB)
#use ANOVA(Analysis of Variance) for testing more than one hypothesis
f_value, p_value = stats.f_oneway(groupA,groupB,groupC)

print(t_stat)
print(p_value)

-3.2359903436078983
0.0014208821931449166


In [11]:
#Data Wrangling & Preprosessing - accertain data quality 

import pandas as pd

data1 = {
    "Age" : [25,26, np.nan,29,22,34],
    "Salary" : [5000,5400,5800,None,5900,6000],
    "Gender" : ['Male','Female','Male','Female','Male','Female'],
    "Years_at_company" : [1,2,3,np.nan,2,8]
}
df = pd.DataFrame(data1)
print(df)



    Age  Salary  Gender  Years_at_company
0  25.0  5000.0    Male               1.0
1  26.0  5400.0  Female               2.0
2   NaN  5800.0    Male               3.0
3  29.0     NaN  Female               NaN
4  22.0  5900.0    Male               2.0
5  34.0  6000.0  Female               8.0


In [13]:
#missing values
missing = df.isnull().sum()
missing

Age                 1
Salary              1
Gender              0
Years_at_company    1
dtype: int64

In [15]:
#use simple imputer from skitlearn to handle missing values

from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy="median")
df['Age'] = num_imputer.fit_transform(df[['Age']])
df['Years_at_company'] = num_imputer.fit_transform(df[['Years_at_company']])

In [16]:
#impute categorical variable
cat_imputer = SimpleImputer(strategy = 'most_frequent')
df['Salary'] = cat_imputer.fit_transform(df[['Salary']])
df

Unnamed: 0,Age,Salary,Gender,Years_at_company
0,25.0,5000.0,Male,1.0
1,26.0,5400.0,Female,2.0
2,26.0,5800.0,Male,3.0
3,29.0,5000.0,Female,2.0
4,22.0,5900.0,Male,2.0
5,34.0,6000.0,Female,8.0


In [17]:
#feature engineering - feature that help improve quality of data
df['Age_years_at_company_interaction'] = df['Age']*df['Years_at_company']
df['Age_years_at_company_interaction']


0     25.0
1     52.0
2     78.0
3     58.0
4     44.0
5    272.0
Name: Age_years_at_company_interaction, dtype: float64

In [27]:
df1 = pd.DataFrame({'Employee':['John', 'Makara'],
                    'Dept' : ['Finance', 'Data']})
df2 = pd.DataFrame({'Employee' :['John', 'Makara'],
                    'Project' :['Budget', 'Billing']})
print(df1,df2)
                    

  Employee     Dept
0     John  Finance
1   Makara     Data   Employee  Project
0     John   Budget
1   Makara  Billing


In [29]:
#merging data frames on conditions
merge_df = pd.merge(df1,df2, on = 'Employee', how='left')
pivot_table = df1.pivot(index='Employee', columns='Dept', values = 'Dept')
pivot_table.fillna(0, inplace=True)
merge_df

Unnamed: 0,Employee,Dept,Project
0,John,Finance,Budget
1,Makara,Data,Billing
