# **Regression**
* [Reading Data](#1)
* [Linear Regression](#3)
* [Multiple Linear Regression](#4)
* [Polynomial Linear Regression](#5)
* [Decision Tree Regression](#6)
* [Random Forest Regression](#7)
* [R Square with Random Forest Regression](#8)
* [R Square with Linear Regression](#9)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a> <br>
# Reading Data

In [None]:
# Load the two CSV files of the dataset:
#   df1 <- column_2C_weka.csv
#   df2 <- column_3C_weka.csv
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv",
        "/kaggle/input/biomechanical-features-of-orthopedic-patients/column_3C_weka.csv",
    )
)


In [None]:
# Preview the first rows of df1 (loaded from column_2C_weka.csv).
df1.head()

In [None]:
# Print a concise summary of df1: columns, non-null counts and dtypes.
df1.info()

In [None]:
In 'column_2C_weka.csv':
* There are 7 features, each with 310 samples.
* Six features are of float type and one is of object type.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df1.
df1.describe()

In [None]:
# Preview the first rows of df2 (loaded from column_3C_weka.csv).
df2.head()

In [None]:
# Print a concise summary of df2: columns, non-null counts and dtypes.
df2.info()

In 'column_3C_weka.csv':
* There are 7 features, each with 310 samples.
* Six features are of float type and one is of object type.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df2.
df2.describe()

### **Correlation Between Features**

In [None]:
# Pairwise correlation between the numeric features only.
# df1 also contains the string-valued 'class' label column, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# numeric columns are selected explicitly (this works on older pandas too).
df1.select_dtypes(include="number").corr()

In [None]:
# Bar chart of the number of samples per class label, followed by the exact
# per-class counts as the cell's displayed value.
class_labels = df1["class"]
sns.countplot(data=df1, x="class")
class_labels.value_counts()

In [None]:
# Keep only the rows labelled 'Abnormal'.
is_abnormal = df1['class'] == 'Abnormal'
data = df1[is_abnormal]

# Extract the two columns of interest as (n, 1) column vectors; the
# regression cell that follows reuses these arrays.
pelvic_incidence = np.array(data['pelvic_incidence']).reshape(-1, 1)
sacral_slope = np.array(data['sacral_slope']).reshape(-1, 1)

# Scatter plot of sacral_slope against pelvic_incidence.
plt.figure(figsize=(10, 10))
plt.scatter(x=pelvic_incidence, y=sacral_slope)
plt.xlabel('pelvic_incidence')
plt.ylabel('sacral_slope')
plt.show()

<a id="3"></a> <br>
# **Linear Regression**

* This is also called "line fitting".
* **y = b0 + b1\*x** where
  * b0 = constant (*the point where the line intersects the y axis*)
  * b1 = coefficient (*slope of the line*)
* The aim is to draw the line that lies closest to the points. The line usually cannot pass exactly through every point, so we define the term "residual".
  * **residual = y - y_head**
      * y is the actual value of the data point
      * y_head is the predicted value, i.e. the point on the line directly above or below the data point (same x)
* We square each residual so that positive and negative residuals do not cancel each other out. Adding up the squared residuals tells us how well the line fits.
* Mean Squared Error  **MSE = (sum(residual^2))/n**
* The smaller the MSE, the better the line fits.


In [None]:
from sklearn.linear_model import LinearRegression

# Simple linear regression of sacral_slope on pelvic_incidence, using the
# 'Abnormal'-only (n, 1) arrays built in the scatter-plot cell.
reg = LinearRegression()
reg.fit(pelvic_incidence, sacral_slope)

# Evenly spaced x values spanning the observed range, used to draw the line.
# ndarray.min()/max() return plain scalars; the builtin min()/max() would
# iterate the (n, 1) array row by row and hand back 1-element arrays.
predict_space = np.linspace(
    pelvic_incidence.min(), pelvic_incidence.max()
).reshape(-1, 1)
predicted = reg.predict(predict_space)

# R^2 computed on the same data the model was fitted to.
print('R^2 score: ',reg.score(pelvic_incidence, sacral_slope))

# Regression line drawn over the scatter of the data.
plt.figure(figsize=(10,10))
plt.plot(predict_space, predicted, color='green', linewidth=2)
plt.scatter(pelvic_incidence,sacral_slope)
plt.xlabel('pelvic_incidence')
plt.ylabel('sacral_slope')
plt.show()

<a id="4"></a> <br>
# Multiple Linear Regression

* **y = b0 + b1\*x1 + b2\*x2**
* The aim is to minimize the MSE.

In [None]:
# Feature matrix: columns 0 and 2 of df1, selected by position
# (pelvic_incidence and lumbar_lordosis_angle per the original author's note).
feature_positions = [0, 2]
x = df1.iloc[:, feature_positions].values

# Target: sacral_slope as an (n, 1) column vector.
y = df1["sacral_slope"].values.reshape(-1, 1)

In [None]:
# Fit y = b0 + b1*x1 + b2*x2 on the x / y arrays built in the previous cell.
multiple_linear_regression = LinearRegression()
multiple_linear_regression.fit(X=x, y=y)

# Learned intercept (b0) and the two slope coefficients (b1, b2).
print("b0: ",multiple_linear_regression.intercept_)
print("b1,b2:",multiple_linear_regression.coef_)

# Predict the target for two hand-written samples that differ only in their
# first feature; the result is the cell's displayed value.
sample_points = np.array([
    [63.0278175, 39.60911701],
    [40.47523153, 39.60911701],
])
multiple_linear_regression.predict(sample_points)

<a id="5"></a> <br>
# Polynomial Linear Regression

* **y = b0 + b1\*x + b2\*x^2 + ... + bn\*x^n**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Single feature (pelvic_incidence) and target (sacral_slope), both as
# (n, 1) column vectors taken from the full df1.
x = np.array(df1.loc[:,'pelvic_incidence']).reshape(-1,1)
y = np.array(df1.loc[:,'sacral_slope']).reshape(-1,1)

# Baseline: straight-line fit and its predictions on the training inputs.
lr = LinearRegression()
lr.fit(x,y)
y_head = lr.predict(x)

# Polynomial regression: y = b0 + b1*x + b2*x^2 + b3*x^3 + ... + bn*x^n
# PolynomialFeatures expands x into its powers up to the fourth degree; an
# ordinary LinearRegression is then fitted on the expanded features.
polynomial_regression = PolynomialFeatures(degree = 4)
x_polynomial = polynomial_regression.fit_transform(x)

linear_regression2 = LinearRegression()
linear_regression2.fit(x_polynomial,y)
y_head2 = linear_regression2.predict(x_polynomial)

# plt.plot joins points in the order they are given. The samples are in
# dataset order, not sorted by x, so plotting them directly draws the
# polynomial curve as a zig-zag. Sort by x first so each curve is drawn
# left to right.
order = np.argsort(x.ravel())
plt.plot(x[order], y_head[order], color="purple", label = "linear")
plt.plot(x[order], y_head2[order], color = "green", label = "poly")
plt.legend()
plt.show()

<a id="6"></a> <br>
# Decision Tree Regression

* **CART**: Classification and Regression Tree
* One of the most important concepts in decision tree regression is the "**split**".
* The regions produced by the splits are called **terminal leaves**.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single feature (pelvic_incidence) and target (sacral_slope), both as
# (n, 1) column vectors taken from the full df1.
x = np.array(df1.loc[:,'pelvic_incidence']).reshape(-1,1)
y = np.array(df1.loc[:,'sacral_slope']).reshape(-1,1)

# Note: no random_state is passed to the regressor here.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(x,y)

# Example single-sample prediction; its return value is discarded.
tree_reg.predict([[5.5]])

# Dense grid over the observed x range (step 0.01) used to draw the tree's
# piecewise-constant prediction. ndarray.min()/max() give plain scalars,
# whereas the builtin min()/max() on a 2-D array return 1-element arrays
# that np.arange would have to convert to scalars implicitly.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1,1)
y_head = tree_reg.predict(x_)

# Visualize: data points in red, tree prediction in green.
plt.figure(figsize = (10,10))
plt.scatter(x,y,color = "red")
plt.plot(x_,y_head,color="green")
plt.xlabel("Pelvic Incidence")
plt.ylabel("Sacral Slope")
plt.show()

<a id="7"></a> <br>
# Random Forest Regression

* **random_state** fixes the seed of the random number generator, so the same random values are selected on every run.
If we don't set random_state, the results can differ every time the code is executed.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Single feature as an (n, 1) matrix; target as a 1-D vector.
x = np.array(df1.loc[:,'pelvic_incidence']).reshape(-1,1)
y = np.array(df1.loc[:,'sacral_slope'])

# Forest of 40 trees with a fixed seed.
rf = RandomForestRegressor(n_estimators = 40,random_state = 42)
rf.fit(x,y)

# Dense grid over the observed x range (step 0.01) used to draw the
# prediction curve. ndarray.min()/max() give plain scalars, whereas the
# builtin min()/max() on a 2-D array return 1-element arrays that np.arange
# would have to convert to scalars implicitly.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1,1)
y_head = rf.predict(x_)

# Visualize: data points in blue, forest prediction in red.
plt.figure(figsize = (10,10))
plt.scatter(x,y,color="blue")
plt.plot(x_,y_head,color = "red")
plt.xlabel("Pelvic Incidence")
plt.ylabel("Sacral Slope")
plt.show()

<a id="8"></a> <br>
# R Square with Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Single feature as an (n, 1) matrix; target as a 1-D vector.
x = np.array(df1['pelvic_incidence']).reshape(-1, 1)
y = np.array(df1['sacral_slope'])

# Forest of 100 trees with a fixed seed.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x, y)

# Predictions are made on the same inputs the model was fitted on, so the
# score printed below is an in-sample R^2.
y_head = rf.predict(x)
print("r_score", r2_score(y, y_head))

<a id="9"></a> <br>
# R Square with Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Single feature as an (n, 1) matrix; target as a 1-D vector, both taken
# from the full df1.
x = np.array(df1.loc[:,'pelvic_incidence']).reshape(-1,1)
y = np.array(df1.loc[:,'sacral_slope'])

# Scatter the same samples the model is fitted on. The earlier
# pelvic_incidence / sacral_slope arrays hold only the 'Abnormal' rows, so
# plotting them here would not match the fitted line.
plt.figure(figsize=(10,10))
plt.scatter(x, y)
plt.xlabel("Pelvic Incidence")
plt.ylabel("Sacral Slope")

# Linear regression model and its predictions on the training inputs.
linear_reg = LinearRegression()
linear_reg.fit(x,y)
y_head = linear_reg.predict(x)
plt.plot(x, y_head , color = "red")

# In-sample R^2 of the fitted line.
print("r_square score: ", r2_score(y, y_head))

# Render the figure explicitly, as the other plotting cells do.
plt.show()