# Regression Tree: Continuous quantitative target variable: predicting rainfall, marks, revenue, etc.

# Classification Tree: Discrete categorical target variable: predicting high or low, win or loss, healthy or unhealthy.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the movie dataset; header=0 tells pandas that the column names are on the first row.
# NOTE(review): the file has an .xls extension but is parsed with read_csv — confirm it is really delimited text.
df=pd.read_csv('../input/others/Movie_regression.xls',header=0)
# Preview the first 10 rows.
df.head(10)

# We want to predict values for Collection (Y)

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Distinct values present in the Genre column.
df['Genre'].unique()

In [None]:
# Per-column dtype and non-null count; used here to spot columns with missing values.
df.info()

# Time_taken has missing values: in the df.info() table above it has only 494 non-null values

# Missing Values Imputation

In [None]:
# Mean of Time_taken; this is the value used to fill the gaps in the next cell.
df['Time_taken'].mean()

In [None]:
# Fill the missing Time_taken entries with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Time_taken']: that form is a chained
# in-place update on a selected column, which pandas warns about and which
# leaves df untouched once Copy-on-Write is enabled.
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].mean())

In [None]:
# Re-check the non-null counts after the imputation above.
df.info()

# Dummy variable creation

We have to convert all our categorical variables into numerical variables.
We do that by creating dummy (one-hot) variables.

In [None]:
# Look at the first rows to identify which columns hold categories rather than numbers.
df.head()

In [None]:
# 3D_available and Genre are categorical columns;
# we will convert them into dummy variables


In [None]:
# Replace the two categorical columns with dummy (indicator) columns.
# drop_first=True drops the first level of each encoded column, so a column
# with n categories becomes n-1 dummy columns.
df=pd.get_dummies(df,columns=['3D_available','Genre'],drop_first=True)
# Preview the encoded frame.
df.head()

In [None]:
# Shape after encoding; the column count now reflects the dummy columns that replaced the two categorical ones.
df.shape

# X-y split

In [None]:
# Feature matrix: every column of df except the target 'Collection'.
is_feature = df.columns != 'Collection'  # boolean mask over the column labels
X = df.loc[:, is_feature]
type(X)

In [None]:
# First rows of the feature matrix.
X.head()

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Target vector: the 'Collection' column we want to predict.
y=df['Collection']
# Show the container type of y.
type(y)

In [None]:
# First values of the target.
y.head()

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# Rows are assigned to train/test at random. Fixing random_state gives the
# same split on every run, so the performance of different models can be
# compared on identical test data.

In [None]:
# The row indexes appear shuffled because the split samples rows at random.
X_train.head()

In [None]:
# Size of the training feature matrix (about 80% of the rows).
X_train.shape

In [None]:
# Size of the test feature matrix (about 20% of the rows).
X_test.shape

# Training Regression Tree

In [None]:
from sklearn import tree
# Regression tree limited to 3 levels to avoid overfitting (max_depth is the
# number of layers in the tree; this notebook's rule of thumb is to stay at
# or below 5).
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and the reported scores would not be reproducible.
regtree = tree.DecisionTreeRegressor(max_depth=3, random_state=0)

In [None]:
# Fit the tree on the training rows only; the test rows stay unseen for evaluation.
regtree.fit(X_train,y_train)

#  Predict values using trained model

In [None]:
# Predictions on the data the tree was trained on (used for the training R2).
y_train_pred=regtree.predict(X_train)
# Predictions on the held-out test data (used for the test MSE and R2).
y_test_pred=regtree.predict(X_test)

In [None]:
# Display the predicted Collection values for the test rows.
y_test_pred

# Model performance = mean squared error and R² (goodness of fit: 1 is a perfect fit, 0 is no better than always predicting the mean; it can be negative on test data when the model does worse than that)

# As a rule of thumb, R² values of 0.4 to 0.8 indicate good models and values above 0.8 excellent models

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Mean squared error on the test set: true values first, predictions second.
mean_squared_error(y_test,y_test_pred)

In [None]:
# R2 on the training data (the notebook author reported about 0.83).
# A high training score only shows that the tree fits the data it has seen;
# judge the model by the test R2 computed in the next cell.
r2_score(y_train,y_train_pred)

In [None]:
# R2 on the held-out test data.
r2_score(y_test,y_test_pred)

# Always use the test R2, not the training R2, to evaluate model performance.

# Plotting Decision Tree

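1. First we create the DOT description of the tree (with out_file=None it is returned as a string, not written to a file).
2. Convert the DOT description into an image.
3. Then use that image to display the graph.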

In [None]:
# Export the fitted tree in Graphviz DOT format; out_file=None returns the DOT text
# instead of writing a file. No feature_names are passed here, so the nodes are
# not labelled with the column names.
dot_data=tree.export_graphviz(regtree, out_file=None)

In [None]:
from IPython.display import Image

In [None]:
import pydotplus

In [None]:
# Parse the DOT text into a pydotplus graph object.
graph=pydotplus.graph_from_dot_data(dot_data)
# Render the graph to PNG bytes and display them inline in the notebook.
Image(graph.create_png())

# Pruning a tree= cutting the parts of the tree which are not beneficial for us

# Pre-Pruning = controlling tree growth

# Max number of Levels in tree

In [None]:
# Pre-pruning by depth: grow at most 3 levels.
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run and the plotted tree is reproducible.
regtree1 = tree.DecisionTreeRegressor(max_depth=3, random_state=0)
regtree1.fit(X_train, y_train)

# DOT text for the fitted tree: nodes are labelled with the training column
# names, and filled=True colours them (per the scikit-learn docs, by the
# extremity of the predicted value for regression, here Collection).
dot_data = tree.export_graphviz(
    regtree1,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)

# Build the graph and show it inline as a PNG.
graph1 = pydotplus.graph_from_dot_data(dot_data)
Image(graph1.create_png())


# Minimum observations at internal node

In [None]:
# Pre-pruning by node size: an internal node is split only if it holds at
# least 40 training samples.
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run and the plotted tree is reproducible.
regtree2 = tree.DecisionTreeRegressor(min_samples_split=40, random_state=0)
regtree2.fit(X_train, y_train)

# DOT text for the fitted tree, labelled with the training column names and
# with coloured nodes (filled=True).
dot_data = tree.export_graphviz(
    regtree2,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)

# Build the graph and show it inline as a PNG.
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())

# Minimum observations at leaf node

In [None]:
# Pre-pruning by leaf size: every leaf must contain at least 25 training
# samples.
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run and the plotted tree is reproducible.
regtree3 = tree.DecisionTreeRegressor(min_samples_leaf=25, random_state=0)
regtree3.fit(X_train, y_train)

# DOT text for the fitted tree, labelled with the training column names and
# with coloured nodes (filled=True).
dot_data = tree.export_graphviz(
    regtree3,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)

# Build the graph and show it inline as a PNG.
graph3 = pydotplus.graph_from_dot_data(dot_data)
Image(graph3.create_png())

# We can also add further conditions to the above tree, e.g. by limiting the number of layers using max_depth

In [None]:
# Combine two pre-pruning rules: at least 25 training samples per leaf and at
# most 4 levels.
# NOTE: this rebinds regtree3 and graph3, replacing any earlier objects held
# under those names.
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run and the plotted tree is reproducible.
regtree3 = tree.DecisionTreeRegressor(
    min_samples_leaf=25,
    max_depth=4,
    random_state=0,
)
regtree3.fit(X_train, y_train)

# DOT text for the fitted tree, labelled with the training column names and
# with coloured nodes (filled=True).
dot_data = tree.export_graphviz(
    regtree3,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
)

# Build the graph and show it inline as a PNG.
graph3 = pydotplus.graph_from_dot_data(dot_data)
Image(graph3.create_png())