importing libraries

In [2]:
import numpy as np
import pandas as pd
from scipy import stats

In [3]:
data = pd.read_csv("student_performance_300.csv")
data.head()

Unnamed: 0,student_id,school,gender,study_hours_week,study_method,attendance_rate,midterm_score,final_score,passed_final,socioeconomic_idx
0,1,C,Male,13.13,Solo,99.44,25.74,30.82,No,0.9
1,2,A,Female,10.65,Group,65.95,26.03,26.69,No,0.2
2,3,C,Male,14.23,Tutor,86.75,33.78,32.63,No,0.34
3,4,C,Male,17.68,Group,98.89,39.71,26.54,No,0.51
4,5,A,Female,14.51,Group,97.3,32.5,28.89,No,0.5


In [4]:
data.describe()

Unnamed: 0,student_id,study_hours_week,attendance_rate,midterm_score,final_score,socioeconomic_idx
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,150.5,12.325333,85.334133,31.307667,36.339,0.5964
std,86.746758,5.273495,9.652695,10.829502,13.05293,0.231585
min,1.0,0.0,58.53,1.3,2.16,0.2
25%,75.75,8.7775,79.2725,24.18,27.495,0.3975
50%,150.5,12.43,85.49,31.72,37.735,0.59
75%,225.25,15.2875,92.6025,38.09,45.17,0.7925
max,300.0,28.27,100.0,59.83,67.41,1.0


In [5]:
data.dtypes

student_id             int64
school                object
gender                object
study_hours_week     float64
study_method          object
attendance_rate      float64
midterm_score        float64
final_score          float64
passed_final          object
socioeconomic_idx    float64
dtype: object

Simple Random Sampling

In [6]:
simple_random_sampling = data.sample(n=6, random_state=90)
print(simple_random_sampling)

     student_id school  gender  study_hours_week study_method  \
83           84      A  Female             14.68        Group   
170         171      B  Female             10.68         Solo   
137         138      C  Female             21.56        Tutor   
164         165      B    Male             16.12         Solo   
242         243      C    Male              4.38         Solo   
162         163      B    Male             12.27        Group   

     attendance_rate  midterm_score  final_score passed_final  \
83             80.26          35.37        42.26           No   
170            81.77          23.39        19.88           No   
137            91.40          25.10        35.98           No   
164            75.63          15.19         9.68           No   
242            80.24          20.34        45.68           No   
162            83.81          24.52        37.94           No   

     socioeconomic_idx  
83                0.42  
170               0.63  
137          

Strata Sampling

In [7]:
strata_simple = []
for name,group in data.groupby(['gender', 'school']):
    k = min(5, len(group))
    strata_simple.append(group.sample(n=k))
print(strata_simple)

[     student_id school  gender  study_hours_week study_method  \
184         185      A  Female             23.71        Tutor   
291         292      A  Female             15.19        Group   
180         181      A  Female             13.88        Tutor   
218         219      A  Female             20.90        Group   
157         158      A  Female             10.05         Solo   

     attendance_rate  midterm_score  final_score passed_final  \
184            87.11          32.48        39.59           No   
291            79.63          28.73        30.70           No   
180            94.19          19.83        14.67           No   
218            80.88          41.29        44.14           No   
157            75.05          21.25        28.21           No   

     socioeconomic_idx  
184               0.63  
291               0.98  
180               0.51  
218               0.28  
157               0.68  ,      student_id school  gender  study_hours_week study_method  \
1

In [8]:
df = pd.concat(strata_simple)
df

Unnamed: 0,student_id,school,gender,study_hours_week,study_method,attendance_rate,midterm_score,final_score,passed_final,socioeconomic_idx
184,185,A,Female,23.71,Tutor,87.11,32.48,39.59,No,0.63
291,292,A,Female,15.19,Group,79.63,28.73,30.7,No,0.98
180,181,A,Female,13.88,Tutor,94.19,19.83,14.67,No,0.51
218,219,A,Female,20.9,Group,80.88,41.29,44.14,No,0.28
157,158,A,Female,10.05,Solo,75.05,21.25,28.21,No,0.68
143,144,B,Female,18.78,Solo,86.54,56.15,60.33,Yes,0.73
237,238,B,Female,8.88,Group,71.37,16.63,30.13,No,0.58
76,77,B,Female,8.38,Solo,90.45,23.03,31.5,No,0.71
296,297,B,Female,10.63,Solo,69.56,34.11,37.52,No,0.59
67,68,B,Female,14.29,Solo,94.75,30.75,37.73,No,0.93


In [9]:
df.to_csv("strata_simple.csv")

Descriptive Statistics

In [10]:
numerical = ["study_hours_week", "attendance_rate", "midterm_score", "final_score"]
row =[]
for i in numerical:
    s = data[i]
    rows = {
        "variable": i,
        "count" : len(s),
        "mean" : s.mean(),
        "median" : s.median(),
        "variance" : s.var(),
        "standard_deviation" : s.std(),
        "minimum" : s.min(),
        "maximum" : s.max(),
        "range" : s.max() - s.min(),
        "q1" : s.quantile(0.25),
        "q2" : s.quantile(0.50),
        "q3" : s.quantile(0.75),
        "iqr" : s.quantile(0.25) - s.quantile(0.75),
        "skewness" : s.skew()
    }
    row.append(rows)
descriptive_statics = pd.DataFrame(row)

In [11]:
print(descriptive_statics)

           variable  count       mean  median    variance  standard_deviation  \
0  study_hours_week    300  12.325333  12.430   27.809755            5.273495   
1   attendance_rate    300  85.334133  85.490   93.174520            9.652695   
2     midterm_score    300  31.307667  31.720  117.278110           10.829502   
3       final_score    300  36.339000  37.735  170.378982           13.052930   

   minimum  maximum  range       q1      q2       q3     iqr  skewness  
0     0.00    28.27  28.27   8.7775  12.430  15.2875  -6.510  0.121889  
1    58.53   100.00  41.47  79.2725  85.490  92.6025 -13.330 -0.340541  
2     1.30    59.83  58.53  24.1800  31.720  38.0900 -13.910 -0.074325  
3     2.16    67.41  65.25  27.4950  37.735  45.1700 -17.675 -0.182267  


In [12]:
descriptive_statics.to_csv("desc_stats.csv",index=False)

OutLier Detection using IQR

In [13]:
for i in numerical:
    p = df[i]
    q1 = p.quantile(0.25)
    q3 = p.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q1 + 1.5*iqr
    outliers = p[(s<lower)|(s>upper)]
print(outliers)

143    60.33
135    49.91
89     53.65
35     53.77
79     62.16
Name: final_score, dtype: float64


Outlier Detection using z score

In [None]:
from scipy.stats import chi2_contingency,f_oneway,ttest_ind,ttest_rel,pearsonr

In [19]:
for i in numerical:
    p = df[i]
    outlier_z = np.abs(stats.zscore(p))
    z = p[outlier_z>3]
print(z)

Series([], Name: final_score, dtype: float64)


hypothesis testing
