In [8]:
## Import required libraries
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [10]:
# Load the dataset
# A forward slash is used as the separator: the old 'data\S...' literal relied on
# the invalid escape sequence '\S' (a SyntaxWarning on recent Python versions) and
# did not resolve to the data/ directory on Linux/macOS.
df = pd.read_csv('data/StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [14]:
# Target vector: the math score column, used later as the regression target.
Y = df.loc[:, 'math score']
Y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [16]:
# Feature frame: every column of the raw data except the target.
X = df.drop(columns='math score')
X

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [23]:
# Names of the non-object (numeric) feature columns; these are passed to the scaler.
num_features = X.select_dtypes(exclude='object').columns
num_features

Index(['reading score', 'writing score'], dtype='object')

In [25]:
# Names of the object-dtype (categorical) feature columns; these are one-hot encoded.
col_features = X.select_dtypes(include='object').columns
col_features

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course'],
      dtype='object')

In [27]:
# One transformer per column family: standardise the numeric scores,
# one-hot encode the categorical descriptors.
scaler = StandardScaler()
encoder = OneHotEncoder()

# Route each group of columns to its transformer; the outputs are concatenated
# in the order listed here (numeric block first, then the one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_features),
        ('cat', encoder, col_features),
    ],
)

In [28]:
# Build the model matrix from the raw frame rather than from X itself, so this
# cell can be re-run safely: the old `X = fit_transform(X)` form fed the already
# transformed ndarray back into a transformer that selects columns by name.
# NOTE(review): the scaler/encoder are fitted on all rows before the train/test
# split, so test-set statistics influence the features - consider fitting on the
# training rows only.
X = preprocessor.fit_transform(df.drop(columns=['math score']))

In [29]:
# Sanity check: (rows, columns) of the transformed feature matrix.
X.shape

(1000, 19)

In [31]:
# Inspect the transformed matrix (scaled numeric columns followed by one-hot columns).
X

array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors keyed by a short display name.
# The randomised estimators get a fixed seed so the reported metrics are
# reproducible across kernel restarts.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42),
}

# Fit every model on the training split and report its hold-out metrics.
for name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print('-------------------------------------')
    # Print the dictionary key rather than the estimator repr, which is
    # extremely long for XGBRegressor.
    print('The model is:', name)
    print('The r2score is:', r2_score(Y_test, y_pred))
    print('The absolute mean error is:', mean_absolute_error(Y_test, y_pred))
    print('The mean squared error is:', mean_squared_error(Y_test, y_pred))
    # Closing separator belongs to the loop body; it was previously indented by
    # three spaces, which is an IndentationError.
    print('-------------------------------------')

-------------------------------------
The model is: LinearRegression()
The r2score is: 0.8804332983749565
The absolute mean error is: 4.21476314247485
The mean squared error is: 29.095169866715473
-------------------------------------
-------------------------------------
The model is: RandomForestRegressor()
The r2score is: 0.8555085225736851
The absolute mean error is: 4.5641750000000005
The mean squared error is: 35.16032493055556
-------------------------------------
-------------------------------------
The model is: SVR()
The r2score is: 0.7286001513223705
The absolute mean error is: 5.401539244496997
The mean squared error is: 66.04200493745648
-------------------------------------
-------------------------------------
The model is: DecisionTreeRegressor()
The r2score is: 0.7337658174788689
The absolute mean error is: 6.355
The mean squared error is: 64.785
-------------------------------------
-------------------------------------
The model is: XGBRegressor(base_score=None, boo

In [40]:
def reverse_string(s):
    """Return ``s`` with its elements in reverse order.

    Works on anything that supports slicing (strings, lists, tuples); the
    result has the same type as the input.
    """
    backwards = slice(None, None, -1)  # same as the [::-1] slice
    return s[backwards]


reverse_string("hello")



'olleh'

In [None]:
def return_unique(num_lst):
    """Return the distinct values of ``num_lst`` as a set.

    Args:
        num_lst: Iterable of hashable values (duplicates allowed).

    Returns:
        A ``set`` containing each value once; the original order is not kept.
    """
    return set(num_lst)


# Call the function by its actual name; the earlier `func(...)` call referenced
# an undefined name and raised NameError on a fresh kernel.
return_unique([1,2,3,4,2,5,2,1,4])

{1, 2, 3, 4, 5}

In [46]:

def word_counter(example):
    """Count how often each whitespace-separated word occurs in ``example``.

    Args:
        example: Text to tokenise with ``str.split()`` (splits on any run of
            whitespace).

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences, with
        keys in first-seen order. Matching is case-sensitive and punctuation
        is not stripped.
    """
    # Counter does the tallying; convert back so callers still get a plain dict.
    return dict(Counter(example.split()))


example = 'The red fox The black fox The white fox'

# Show the result so the cell actually demonstrates the function.
word_counter(example)

 


In [47]:
# Re-display the first rows of the raw frame for reference.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [None]:
# Average score per gender. numeric_only=True restricts the aggregation to the
# numeric score columns; without it pandas >= 2.0 tries to average the string
# columns as well and raises TypeError.
df.groupby('gender').mean(numeric_only=True)