In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()  # switch on seaborn's default theme for the figures drawn below

If you want to see more in-depth thoughts on this analysis, please visit my blog: https://datasciencerecruit.com/

In [None]:
# Load the SAT/GPA dataset used for the single-feature regressions below.
data = pd.read_csv('../input/satandgpa-lr/SATandGPA_LinearRegression.csv')

In [None]:
data.info()  # column names, dtypes and non-null counts of the loaded frame

In [None]:
data.head(10)  # Show the first 10 rows to check that everything loaded all right

In [None]:
# Predictor (SAT) and response (GPA) for the single-feature regression.
x, y = data['SAT'], data['GPA']

# Scatter of the raw observations, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('SAT', fontsize=20)
ax.set_ylabel('GPA', fontsize=20)
plt.show()

In [None]:
# statsmodels' OLS does not add an intercept by itself, so give the predictor
# a constant column before building the model.
x1 = sm.add_constant(x)

# Ordinary least squares: GPA (endog) explained by the constant and SAT (exog).
model = sm.OLS(endog=y, exog=x1)
results = model.fit()

# Last expression of the cell, so the statistical summary table is displayed.
results.summary()

In [None]:
# Overlay the fitted OLS line on the raw observations.
plt.scatter(x, y)

# Read the coefficients from the fitted statsmodels result instead of re-typing
# rounded numbers from the summary table: a rounded slope does not draw the
# fitted line, and hand-copied values go stale whenever the data change.
# `sm.add_constant` names the intercept column 'const'; the slope is keyed by
# the predictor's column name, 'SAT'.
intercept = results.params['const']
slope = results.params['SAT']
y_new = slope * x + intercept  # predicted GPA for every observed SAT score

# x is shared by the points and the line; y_new holds the model's predictions.
plt.plot(x, y_new, 'r--', lw=3, label='regression line')
plt.xlabel('SAT', fontsize=20)
plt.ylabel('GPA', fontsize=20)
plt.legend()  # without a legend the label passed to plt.plot is never shown
plt.show()

------------------------------------------------------------------------------------------------------------------

## REGRESSION WITH SKLEARN

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# LinearRegression needs a 2-D feature matrix; both variables are converted to
# (n_samples, 1) NumPy arrays in a single pass.
x, y = (np.array(values).reshape(-1, 1) for values in (x, y))

# fit() returns the estimator itself, so `results` and `model` refer to the
# same fitted LinearRegression object.
model = LinearRegression()
results = model.fit(x, y)

In [None]:
results.coef_  # fitted coefficient (slope) for the SAT feature

In [None]:
results.intercept_  # fitted intercept of the regression line

In [None]:
results.score(x,y)  # R^2 of the fitted model, evaluated on the same data it was trained on

In [None]:
# Overlay the scikit-learn fit on the raw observations.
plt.scatter(x, y)

# Ask the fitted estimator for its predictions instead of hard-coding the
# coefficients printed by the cells above: the line then always matches the
# model, even if the data or the fit change.
y_new = results.predict(x)

# x is shared by the points and the line; y_new holds the model's predictions.
plt.plot(x, y_new, 'r--', lw=3, label='regression line')
plt.xlabel('SAT', fontsize=20)
plt.ylabel('GPA', fontsize=20)
plt.legend()  # without a legend the label passed to plt.plot is never shown
plt.show()    # render the figure; also keeps the text repr of the last call out of the output

------------------------------------------------------------------------------------------------------------------------

## Regression with Attendance

In [None]:
# Load the second dataset, which carries a Yes/No 'Attendance' column next to SAT and GPA.
# NOTE: this rebinds `data`, replacing the frame used for the single-feature regressions.
data = pd.read_csv('../input/103-dummiescsv/1.03. Dummies.csv')

In [None]:
data.head(10)  # Show the first 10 rows to check that everything loaded all right

In [None]:
data.info()  # column names, dtypes and non-null counts before encoding 'Attendance'

In [None]:
# Encode the Yes/No attendance flag as 1/0 so it can enter the regression as a
# dummy variable (the text labels themselves have no numeric meaning).
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: with only
# 'Yes'/'No' in the mapping, a second execution would send the already-encoded
# 1/0 values to NaN and silently wipe the column.
data['Attendance'] = data['Attendance'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
data.info()  # confirm that 'Attendance' is now numeric

In [None]:
# Design matrix (SAT plus the 0/1 attendance dummy) and the GPA target.
x, y = data[['SAT', 'Attendance']], data['GPA']

------------------------------------------------------------------------------------------------------------------

## WITH STATSMODELS

In [None]:
# Add the constant column first; statsmodels' OLS will not create an intercept
# term on its own.
x1 = sm.add_constant(x)

# Ordinary least squares: GPA (endog) explained by the constant, SAT and the
# attendance dummy (exog).
model = sm.OLS(endog=y, exog=x1)
results = model.fit()

# Last expression of the cell, so the statistical summary table is displayed.
results.summary()

In [None]:
plt.figure(figsize=(10,8))

# Observations, coloured by the 0/1 attendance dummy.
sns.scatterplot(x=data['SAT'], y=data['GPA'], hue=data['Attendance'])

# Read the coefficients from the fitted statsmodels result rather than copying
# rounded values out of the summary table, so the lines always match the model.
# `sm.add_constant` names the intercept 'const'; the other two coefficients are
# keyed by their column names.
intercept = results.params['const']
sat_slope = results.params['SAT']
attendance_shift = results.params['Attendance']

# Both groups share the SAT slope; the dummy only moves the intercept.
y_no = sat_slope * data['SAT'] + intercept                      # Attendance == 0
y_yes = sat_slope * data['SAT'] + intercept + attendance_shift  # Attendance == 1

# x is the same for both lines; only the predicted GPA differs.
sns.lineplot(x=data['SAT'], y=y_no, lw=3, label='<75%')
sns.lineplot(x=data['SAT'], y=y_yes, lw=3, label='>75%')
plt.xlabel('SAT', fontsize=20)
plt.ylabel('GPA', fontsize=20)
plt.show()  # render the figure; also keeps the text repr of the last call out of the output

**We can see that the SAT score is a good indicator of a student's GPA after graduation, but our R^2 is still a bit low, so we may need more variables to perform a better regression.**

**Further variables will be added to this dataset to perform a multivariable linear regression.**