# Finding the Meaning

## Computing Aggregate Statistics
Can obtain summary statistics by using the describe function.

In [30]:
# create the dataframe
import pandas as pd
names = ["Bob","Jessica","Mary","John","Mel"]
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))
df = pd.DataFrame(data=GradeList, columns = ["Names","Grades"])
df

Unnamed: 0,Names,Grades
0,Bob,76
1,Jessica,95
2,Mary,77
3,John,78
4,Mel,99


In [7]:
# count (number of values)
df['Grades'].count()

4

In [9]:
# arithmetic average
df['Grades'].mean()

85.0

In [11]:
# standard deviation
df['Grades'].std()

11.067971810589327

In [13]:
# minimum value
df['Grades'].min()

76

In [14]:
# maximum value
df['Grades'].max()

99

In [15]:
# first quartile
df['Grades'].quantile(.25)

77.0

In [18]:
# second quartile
df['Grades'].quantile(.5)

78.0

In [20]:
# third quartile
df['Grades'].quantile(0.75)

95.0

In [21]:
# measuring central tendency
# the mean
df['Grades'].mean()

85.0

In [22]:
# median
df['Grades'].median() # same as quantile(0.5)

78.0

In [31]:
# mode
df['Grades'].mode() # not very useful since each value only occurs once

0    76
1    77
2    78
3    95
4    99
dtype: int64

In [33]:
# variance
df['Grades'].var()

122.5

If you do not specify which column to use to calculate the aggregrate statistics, Python will just run the function on all applicable columns.

In [35]:
# computing variance on all numeric columns in the dataframe
df.var()

Grades    122.5
dtype: float64

In [3]:
# Practice - computing aggregate statistics with multiple columns
import pandas as pd
# creating the dataset
names = ["Bob","Jessica","Mary","John","Mel"]
grades = [76,95,77,78,99]
bsdegrees = [1,1,0,0,1]
msdegrees = [2,1,0,0,0]
phddegrees = [0,1,0,0,0]
GradeList = list(zip(names,grades,bsdegrees,msdegrees,phddegrees))
df = pd.DataFrame(data=GradeList,
                 columns = ["Names","Grades","BS","MS","PhD"])
df.head()

Unnamed: 0,Names,Grades,BS,MS,PhD
0,Bob,76,1,2,0
1,Jessica,95,1,1,1
2,Mary,77,0,0,0
3,John,78,0,0,0
4,Mel,99,1,0,0


In [4]:
# BS - mean
df["BS"].mean()

0.6

In [5]:
# BS - median
df["BS"].median()

1.0

In [6]:
# BS - standard deviation
df["BS"].std()

0.5477225575051662

In [7]:
# BS - variance
df["BS"].var()

0.30000000000000004

In [8]:
# BS - first quartile
df["BS"].quantile(0.25)

0.0

In [9]:
# BS - third quartile
df["BS"].quantile(0.75)

1.0

In [10]:
# BS - max (other format) 
df.BS.max()

1

In [12]:
# BS - min (other format)
df.BS.min()

0

In [13]:
# MS - mean
df.MS.mean()

0.6

In [14]:
# MS - median
df.MS.median()

0.0

In [15]:
# MS - standard deviation
df.MS.std()

0.8944271909999157

In [17]:
# MS - variance
df.MS.var()

0.7999999999999998

In [18]:
# MS - first quartile
df.MS.quantile(0.25)

0.0

In [19]:
# MS - third quartile
df.MS.quantile(0.75)

1.0

In [20]:
# MS - min
df.MS.min()

0

In [21]:
# MS - max
df.MS.max()

2

In [22]:
# PhD - mean
df.PhD.mean()

0.2

In [23]:
# PhD - median
df.PhD.median()

0.0

In [24]:
# PhD - standard deviation
df.PhD.std()

0.44721359549995804

In [25]:
# PhD - variance
df.PhD.var()

0.20000000000000007

In [26]:
# PhD - first quartile
df.PhD.quantile(0.25)

0.0

In [28]:
# PhD - third quantile
df.PhD.quantile(0.75)

0.0

In [29]:
# PhD - min
df.PhD.min()

0

In [30]:
# PhD - max
df.PhD.max()

1

## Computing Aggregate Statistics on Matching Rows

In [31]:
import pandas as pd
# create the dataframe
names = ["Bob","Jessica","Mary","John","Mel"]
grades = [76,95,77,78,99]
bs = [1,1,0,0,1]
ms = [2,1,0,0,0]
phd = [0,1,0,0,0]
GradeList = list(zip(names,grades,bs,ms,phd))
df = pd.DataFrame(data=GradeList,
                 columns=["Names","Grades","BS","MS","PhD"])
df

Unnamed: 0,Names,Grades,BS,MS,PhD
0,Bob,76,1,2,0
1,Jessica,95,1,1,1
2,Mary,77,0,0,0
3,John,78,0,0,0
4,Mel,99,1,0,0


In [36]:
# count the rows of people without a phd
df.loc[df.PhD == 0].count()

Names     4
Grades    4
BS        4
MS        4
PhD       4
dtype: int64

In [39]:
# computing aggregate statistics for a column
# using rows that meat a certain condition
df.loc[df.PhD == 0].Grades.mean() # find the average grade for people
                        # without a PhD

82.5

In [42]:
# Practice - find the average grade for people with masters degrees
df.loc[df.MS > 0].Grades.mean()

85.5

In [44]:
# mean grade for people without a bachelors degree
df.loc[df.BS == 0].Grades.mean()

77.5

## Sorting Data
Often, we get data in a random order, but need it to be sorted in order to perform our analyses.

In [47]:
# loading data
import pandas as pd
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [50]:
# sort the dataframe's rows by age (descending)
# soft_values()
df = df.sort_values(by="age",ascending = False)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
1000,Hanna,Mooney,female,19,2,10,88.8,"8293 SW. Cedar Swamp Lane, Union, NJ 07083"
868,Craig,Moses,male,19,4,19,100.0,"7218 Grove Rd., Melbourne, FL 32904"
822,Rinah,Jacobson,female,19,1,7,80.8,"737 Amherst Court, Amsterdam, NY 12010"
826,Hayes,Wilkinson,male,19,5,3,76.0,"350 Temple Court, Mason City, IA 50401"
831,Noah,Mckay,male,19,5,8,74.8,"9434 Carriage Ave., Cordova, TN 38016"


In [52]:
# sort rows by hours of study and exercise, ascending
df = df.sort_values(by=["exercise","hours"],ascending=[True,True])
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
1278,Barclay,Nichols,male,19,0,2,66.3,"9999 Longbranch St., Wayne, NJ 07470"
1004,Cheyenne,Walsh,female,15,0,10,74.6,"79 E. 2nd St., Hollis, NY 11423"
1637,Griffin,Burch,male,17,0,11,83.0,"277 Carson Drive, Pewaukee, WI 53072"
908,Mona,Gilliam,female,17,0,11,89.5,"8436 South Temple St., Natick, MA 01760"
1226,Magee,Berger,male,14,0,13,91.1,"137 West Beach St., Hamden, CT 06514"


In [54]:
# Practice - sort the dataframe by name, age, and grade
# ascending
df = df.sort_values(by=["lname","age","grade"], ascending = True)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
1439,Levi,Acevedo,male,14,2,6,63.0,"87 Cooper Ave., Westland, MI 48185"
1770,Leandra,Acevedo,female,14,1,15,94.4,"51 Central Drive, South Windsor, CT 06074"
762,Rhiannon,Acevedo,female,15,3,10,72.7,"451 Pendergast Street, Bartlett, IL 60103"
819,Athena,Acevedo,female,15,4,12,90.0,"8392 Euclid Ave., Miami, FL 33125"
1458,Cedric,Acevedo,male,19,5,14,98.8,"50 Vine Lane, Derby, KS 67037"


In [56]:
df = df.sort_values(by=["lname","age","grade"], ascending = False)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
210,Keith,Zimmerman,male,14,3,10,86.0,"9002 Lakewood Ave., Fort Washington, MD 20744"
631,Noble,Zamora,male,17,1,14,91.7,"892 Howard Ave., Bloomington, IN 47401"
978,Ursula,York,female,16,2,13,86.7,"866 Circle Ave., Deland, FL 32720"
779,Allen,York,male,15,3,10,80.0,"7750 Queen Dr., Dallas, GA 30132"
1680,Clayton,Yates,male,16,5,14,100.0,"9528 Miller Drive, Klamath Falls, OR 97603"


## Correlation
Correlation refers broadly to statistical relationships involving dependence. Most commonly, it refers to the extent to which two variables have a linear relationship with each other. Examples include the relationship between parents' height and children's height (which we expect to have a strong, positive correlation) and the relationship between price and demaind (which we expect to have a strong, negative correlation).

In [57]:
# Running a correlation
import pandas as pd
# read in the data
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [58]:
# finding the correlation
df.corr()
# highest absolute values - most correlated
# lowest absolute values - least correlated

Unnamed: 0,age,exercise,hours,grade
age,1.0,-0.003643,-0.017467,-0.00758
exercise,-0.003643,1.0,0.021105,0.161286
hours,-0.017467,0.021105,1.0,0.801955
grade,-0.00758,0.161286,0.801955,1.0


In [59]:
# Practice - using the parkinson's dataset
import pandas as pd
Location = "datasets/parkinsons.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,Participant_code,Age,Gender,Fam_History_Parkinsons,Age_of_onset,Duration_from_first_symptoms,Antidepressants,Antiparkinsonian_meds,Antipsychotic_meds,Benzodiazepine_meds,...,AccelerationSpeechTiming,DurationPauseIntervals,DurationVoicedIntervals,GapingBTVoicedInterval,DurationUnvoicedStops,DecayUnvoicedFricatives,RelativeLoudnessRespiration,PauseIntPerResp,RateSpeechRespiration,LatencyRespExchange
0,PD,58,F,No,56.0,2.0,No,No,No,No,...,-2.82,158,318,49.01,22.37,0.588,-19.77,6.0,13.81,127
1,PD,68,F,No,67.0,1.0,No,No,No,No,...,8.2,295,264,40.56,26.88,-0.825,-23.26,4.0,21.77,313
2,PD,68,M,No,67.0,1.0,No,No,No,No,...,4.71,280,317,48.97,22.37,-0.955,-13.29,4.0,22.52,201
3,PD,75,M,No,73.0,2.0,No,No,No,No,...,-9.09,397,800,18.69,49.37,0.791,-25.08,2.0,14.37,151
4,PD,61,M,Yes,60.0,0.7,No,No,No,No,...,11.77,206,480,33.54,26.87,0.075,-22.32,5.0,14.61,151


In [60]:
# find the correlation between the variables
df.corr() # big data frame since there are a lot of variables

Unnamed: 0,Age,Age_of_onset,Duration_from_first_symptoms,Levodopa_equivalent,Clonazepam,EntropySpeechTiming,RateSpeechTiming,AccelerationSpeechTiming,DurationPauseIntervals,DurationVoicedIntervals,GapingBTVoicedInterval,DurationUnvoicedStops,DecayUnvoicedFricatives,RelativeLoudnessRespiration,PauseIntPerResp,RateSpeechRespiration,LatencyRespExchange
Age,1.0,0.924583,-0.043365,,-0.181748,0.006161,-0.047259,0.089541,0.023392,0.124064,0.105468,0.07245,-0.211831,-0.045692,-0.026765,0.016729,-0.027783
Age_of_onset,0.924583,1.0,-0.420354,,-0.401601,0.067829,-0.102134,0.104441,0.085748,0.179408,0.059391,0.0465,-0.168093,0.027061,0.052069,-0.044968,0.011309
Duration_from_first_symptoms,-0.043365,-0.420354,1.0,,0.293918,-0.134221,0.011489,-0.086331,-0.078771,-0.065264,0.00105,0.121795,-0.074541,0.016002,-0.07749,0.069802,-0.022594
Levodopa_equivalent,,,,,,,,,,,,,,,,,
Clonazepam,-0.181748,-0.401601,0.293918,,1.0,-0.791914,-0.69898,-0.225714,0.949982,0.322861,-0.8229,0.596103,-0.268905,-0.46904,-0.645692,0.546634,-0.173018
EntropySpeechTiming,0.006161,0.067829,-0.134221,,-0.791914,1.0,0.392359,-0.02053,-0.402679,-0.389259,0.739073,-0.53731,-0.043771,0.260271,0.532772,-0.09404,0.023395
RateSpeechTiming,-0.047259,-0.102134,0.011489,,-0.69898,0.392359,1.0,0.042123,-0.778113,-0.773582,0.589514,-0.635871,-0.095244,0.392791,0.625879,-0.251247,-0.293842
AccelerationSpeechTiming,0.089541,0.104441,-0.086331,,-0.225714,-0.02053,0.042123,1.0,-0.002407,-0.064614,-0.004085,-0.061647,-0.147714,-0.131518,-0.029167,-0.043568,-0.006692
DurationPauseIntervals,0.023392,0.085748,-0.078771,,0.949982,-0.402679,-0.778113,-0.002407,1.0,0.43825,-0.541767,0.628998,0.010327,-0.396658,-0.635359,0.380658,0.562429
DurationVoicedIntervals,0.124064,0.179408,-0.065264,,0.322861,-0.389259,-0.773582,-0.064614,0.43825,1.0,-0.567734,0.475179,0.155827,-0.334646,-0.38055,-0.03985,-0.033681


## Regression
Use regression to create an equation that explains or predicts a dependent variable based on one or more independent variables.

In [61]:
# load data from a csv
import pandas as pd
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


The next step is to decide which columns we want to be our independent variable and which one we want to be our dependent variable.

In this example, we will try to predict grade based on age, hours of exercise, and hours of study.

In [64]:
# first example
import statsmodels.formula.api as sm
result = sm.ols(
    formula='grade ~ age + exercise + hours', data = df).fit()
# equation - dv ~ iv1 + iv2 + iv3 ...
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.664
Model:,OLS,Adj. R-squared:,0.664
Method:,Least Squares,F-statistic:,1315.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,10:25:22,Log-Likelihood:,-6300.7
No. Observations:,2000,AIC:,12610.0
Df Residuals:,1996,BIC:,12630.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,57.8704,1.321,43.804,0.000,55.279,60.461
age,0.0397,0.075,0.532,0.595,-0.107,0.186
exercise,0.9893,0.089,11.131,0.000,0.815,1.164
hours,1.9165,0.031,61.564,0.000,1.855,1.978

0,1,2,3
Omnibus:,321.187,Durbin-Watson:,2.047
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2196.187
Skew:,-0.567,Prob(JB):,0.0
Kurtosis:,8.007,Cond. No.,213.0


Since we notice that the p-value of the t test for significance for the slope of age in the model is far from significant, we will rerun the analysis without this independent variable.

In [65]:
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ hours + exercise',
    data = df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.664
Model:,OLS,Adj. R-squared:,0.664
Method:,Least Squares,F-statistic:,1973.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,10:35:44,Log-Likelihood:,-6300.8
No. Observations:,2000,AIC:,12610.0
Df Residuals:,1997,BIC:,12620.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,58.5316,0.447,130.828,0.000,57.654,59.409
hours,1.9162,0.031,61.575,0.000,1.855,1.977
exercise,0.9892,0.089,11.131,0.000,0.815,1.163

0,1,2,3
Omnibus:,318.721,Durbin-Watson:,2.048
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2158.0
Skew:,-0.564,Prob(JB):,0.0
Kurtosis:,7.962,Cond. No.,43.2


In [69]:
# Practice - converting gender to numeric, and using it as an IV
# reading in the data
import pandas as pd
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()
# convert gender to numeric
# create a function
def gender_to_num(x):
    if x == "female":
        return 0
    if x == "male":
        return 1

df.gender = df.gender.apply(gender_to_num)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,0,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,1,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,1,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,0,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,0,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [70]:
# run the regression
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ hours + exercise + gender',
    data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.664
Method:,Least Squares,F-statistic:,1318.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,10:46:43,Log-Likelihood:,-6299.3
No. Observations:,2000,AIC:,12610.0
Df Residuals:,1996,BIC:,12630.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,58.7621,0.466,126.172,0.000,57.849,59.676
hours,1.9170,0.031,61.628,0.000,1.856,1.978
exercise,0.9840,0.089,11.073,0.000,0.810,1.158
gender,-0.4476,0.253,-1.770,0.077,-0.943,0.048

0,1,2,3
Omnibus:,322.999,Durbin-Watson:,2.048
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2244.206
Skew:,-0.566,Prob(JB):,0.0
Kurtosis:,8.064,Cond. No.,45.6


We can see that adding gender to our model slightly improves the model's R-square value (goes from 0.664 to 0.665).

## Regression without Intercept

Only want to do this when we KNOW FOR SURE that the true intercept of 
the data should be 0 (that the model should pass through the point (0,0)). 
Basically, we are coercing the model to go through the point (0,0), and then finding the relevant regression information of this model. Lets give it a shot below and see how it is done.

In [72]:
# running regression without intercept
import pandas as pd
# read in the data
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [74]:
# create the model
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ age + exercise + hours - 1',
    # the - 1 at the end indicates we want to get rid of
    # the intercept
    data = df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.991
Model:,OLS,Adj. R-squared:,0.991
Method:,Least Squares,F-statistic:,72840.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,11:07:54,Log-Likelihood:,-6974.3
No. Observations:,2000,AIC:,13950.0
Df Residuals:,1997,BIC:,13970.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,3.1129,0.035,88.030,0.000,3.044,3.182
exercise,1.7659,0.122,14.482,0.000,1.527,2.005
hours,2.2860,0.042,54.486,0.000,2.204,2.368

0,1,2,3
Omnibus:,131.221,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,403.367
Skew:,-0.301,Prob(JB):,2.5700000000000003e-88
Kurtosis:,5.116,Cond. No.,14.2


In [76]:
# Practice
import pandas as pd
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [87]:
# test for the relationship between just grade and age (intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ age', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.1148
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.735
Time:,11:37:13,Log-Likelihood:,-7391.4
No. Observations:,2000,AIC:,14790.0
Df Residuals:,1998,BIC:,14800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,83.2782,2.142,38.871,0.000,79.077,87.480
age,-0.0436,0.129,-0.339,0.735,-0.296,0.209

0,1,2,3
Omnibus:,18.757,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18.432
Skew:,-0.212,Prob(JB):,9.94e-05
Kurtosis:,2.797,Cond. No.,164.0


In [89]:
# test for the relationship between just grade and age (no intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ age - 1', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.976
Model:,OLS,Adj. R-squared:,0.976
Method:,Least Squares,F-statistic:,80830.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,11:38:34,Log-Likelihood:,-7954.5
No. Observations:,2000,AIC:,15910.0
Df Residuals:,1999,BIC:,15920.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,4.9277,0.017,284.306,0.000,4.894,4.962

0,1,2,3
Omnibus:,5.034,Durbin-Watson:,1.952
Prob(Omnibus):,0.081,Jarque-Bera (JB):,5.089
Skew:,-0.114,Prob(JB):,0.0785
Kurtosis:,2.906,Cond. No.,1.0


In [90]:
# test for the relationship between just grade and exercise (intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ exercise', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.026
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,53.36
Date:,"Tue, 21 May 2019",Prob (F-statistic):,3.99e-13
Time:,11:38:54,Log-Likelihood:,-7365.1
No. Observations:,2000,AIC:,14730.0
Df Residuals:,1998,BIC:,14750.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,79.2415,0.502,157.800,0.000,78.257,80.226
exercise,1.1047,0.151,7.305,0.000,0.808,1.401

0,1,2,3
Omnibus:,18.201,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.607
Skew:,-0.202,Prob(JB):,0.00015
Kurtosis:,2.78,Cond. No.,8.33


In [93]:
# test for the relationship between just grade and exercise (no intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ exercise - 1', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.82
Model:,OLS,Adj. R-squared:,0.82
Method:,Least Squares,F-statistic:,9094.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,11:40:04,Log-Likelihood:,-9965.0
No. Observations:,2000,AIC:,19930.0
Df Residuals:,1999,BIC:,19940.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
exercise,22.6657,0.238,95.363,0.000,22.200,23.132

0,1,2,3
Omnibus:,551.975,Durbin-Watson:,1.654
Prob(Omnibus):,0.0,Jarque-Bera (JB):,90.877
Skew:,0.044,Prob(JB):,1.85e-20
Kurtosis:,1.96,Cond. No.,1.0


In [97]:
# test for the relationship between just grade and study (intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ hours', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.643
Model:,OLS,Adj. R-squared:,0.643
Method:,Least Squares,F-statistic:,3601.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,11:41:45,Log-Likelihood:,-6361.0
No. Observations:,2000,AIC:,12730.0
Df Residuals:,1998,BIC:,12740.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,61.4193,0.376,163.544,0.000,60.683,62.156
hours,1.9235,0.032,60.006,0.000,1.861,1.986

0,1,2,3
Omnibus:,303.363,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1929.064
Skew:,-0.545,Prob(JB):,0.0
Kurtosis:,7.686,Cond. No.,34.0


In [100]:
# test for the relationship between just grade and study (no intercept)
import statsmodels.formula.api as sm
result = sm.ols(
    formula = 'grade ~ hours - 1', data=df).fit()
result.summary()

0,1,2,3
Dep. Variable:,grade,R-squared:,0.929
Model:,OLS,Adj. R-squared:,0.929
Method:,Least Squares,F-statistic:,26330.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,11:44:07,Log-Likelihood:,-9027.3
No. Observations:,2000,AIC:,18060.0
Df Residuals:,1999,BIC:,18060.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
hours,6.8407,0.042,162.273,0.000,6.758,6.923

0,1,2,3
Omnibus:,7.01,Durbin-Watson:,1.794
Prob(Omnibus):,0.03,Jarque-Bera (JB):,5.684
Skew:,-0.032,Prob(JB):,0.0583
Kurtosis:,2.747,Cond. No.,1.0


Based off of these results, I would chose the model predicting grade based on the independent variable hours of studying. I would chose the model that includes the intercept. I would chose a model that includes the intercept because, for each of the models, it does not make practical sense to coerce the y-intercept to be 0. Therefore, since we do not want to bias our results, we will include the intercept in our model. Out of all the models that included the intercept, the model with hours of study as the independent variable had the highest R-Square value. Therefore, I believe this is the best model.

## Creating Basic Pivot Tables

A pivot table is a table of statistics that summarizes data from a more extensive table. The summary can include sums, averages, and other statistics. Pivot tables are a useful data processing tool because they allow analysts to rearrage (or "pivot") statistics in order to draw attention to/explore what they are interested in.

In [102]:
# load data from CSV
import pandas as pd
Location = "datasets/gradedata.csv"
df = pd.read_csv(Location)
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


To get a simple pivot table, we need a dataframe and an index.

In [105]:
# averages of all numeric columns by gender
pd.pivot_table(df, index=['gender'])
# shows us the averages of numeric columns based on the provided index

Unnamed: 0_level_0,age,exercise,grade,hours
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,16.568,3.047,82.7173,10.932
male,16.589,2.954,82.3948,11.045


In [106]:
# specifying just one numeric column
pd.pivot_table(df,values=["grade"],index=["gender"]) # just grade value

Unnamed: 0_level_0,grade
gender,Unnamed: 1_level_1
female,82.7173
male,82.3948


In [108]:
# minimum grade by gender
pd.pivot_table(df,values=["grade"],index=["gender"],aggfunc="min")

Unnamed: 0_level_0,grade
gender,Unnamed: 1_level_1
female,32.0
male,43.0


In [110]:
# finding minimum hours of study by gender
pd.pivot_table(df,values=["hours"],index=["gender"],aggfunc="min")

Unnamed: 0_level_0,hours
gender,Unnamed: 1_level_1
female,2
male,0


In [114]:
# can add other columns to the index as well
# viewing maximum grade categorized by age and gender
pd.pivot_table(df,values=["grade"],index=["gender","age"], aggfunc="max")

Unnamed: 0_level_0,Unnamed: 1_level_0,grade
gender,age,Unnamed: 2_level_1
female,14,100.0
female,15,100.0
female,16,100.0
female,17,100.0
female,18,100.0
female,19,100.0
male,14,100.0
male,15,100.0
male,16,100.0
male,17,100.0


In [116]:
# viewing maximum hours of study by age and gender
pd.pivot_table(df,values=["hours"],index=["gender","age"], aggfunc="max")

Unnamed: 0_level_0,Unnamed: 1_level_0,hours
gender,age,Unnamed: 2_level_1
female,14,20
female,15,20
female,16,19
female,17,20
female,18,20
female,19,20
male,14,19
male,15,20
male,16,20
male,17,20


In [119]:
# adding other columns to the values we are looking for
# average grade and hours by gender
pd.pivot_table(df,
              index=['gender'],
              values=['grade','hours'],
              aggfunc='mean')

Unnamed: 0_level_0,grade,hours
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,82.7173,10.932
male,82.3948,11.045


In [125]:
# creating pivot tables from subsets of data
# first select data
# then, do a standard pivot on that selection
# example:
df2 = df.loc[df.age==17] # select data, store in a new dataframe
pd.pivot_table(df2,
              values=['grade','hours'],
              index=['gender'],
              aggfunc='mean') # perform a standard pivot using the new df

Unnamed: 0_level_0,grade,hours
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,83.599435,10.943503
male,82.949721,11.268156


In [129]:
# include totals on Python pivot tables (allows for comparison between
# groups and overall)
df2 = df.loc[df.age==17]
pd.pivot_table(df2,
              values = ['grade','hours'],
              index=['gender'],
              aggfunc='mean',
              margins='True') # specify margins w/ boolean value True

Unnamed: 0_level_0,grade,hours
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,83.599435,10.943503
male,82.949721,11.268156
All,83.272753,11.106742


In [134]:
# Practice - create a pivot table showing the average grade by gender
# of people who had more than two hours of exercise per week
df2 = df.loc[df.exercise > 2]
pd.pivot_table(df2,
              index=['gender'],
              values = ['grade'],
              aggfunc='mean')

Unnamed: 0_level_0,grade
gender,Unnamed: 1_level_1
female,83.730343
male,83.622542


In [136]:
df[:,0]

TypeError: unhashable type: 'slice'