# **A Machine Learning Technique for Minimizing Ring Blow Holes**

# **Import Important Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Loading the Data Set**

In [None]:
# Read the casting dataset from the Kaggle input folder and preview its first three rows.
csv_path = '../input/furnace-data/Casting_Data.csv'
df = pd.read_csv(csv_path)
df.head(3)

In [None]:
# Last three rows of the frame.
df.tail(3)

# **EDA EXPLORATORY DATA ANALYSIS**

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place (pandas keeps the first occurrence by default),
# then display the de-duplicated frame.
df.drop_duplicates(inplace=True)
df

In [None]:
# Cross-check: after the drop above, the duplicate count should be 0.
df.duplicated().sum()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Keep only the numeric columns; used by the skewness and kurtosis checks.
num_features = df.select_dtypes(include=np.number)

In [None]:
# Per-column skewness, sorted from most negative to most positive.
# Values far from 0 indicate an asymmetric distribution.
skewness = num_features.skew().sort_values()
skewness

**Insight:** RB2, RL2 and percentage scrap are highly skewed.

In [None]:
# Per-column excess kurtosis, sorted ascending.
# Large positive values indicate heavy tails, i.e. likely outliers.
kurtosis = num_features.kurt().sort_values()
kurtosis

In [None]:
# Insight: RB2, RL2 and percentage scrap have outliers

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# - select_dtypes keeps this working on pandas >= 2.0, where DataFrame.corr()
#   raises on non-numeric columns instead of silently dropping them.
# - sort_values replaces indexing the label-indexed Series with np.argsort
#   positions, which pandas has deprecated.
corr = df.select_dtypes(include=np.number).corr()["PERCENTAGE_SCRAP"]
corr.sort_values(ascending=False)

# **VISUALISATION USING MATPLOTLIB AND SEABORN**

# **Correlation**

In [None]:
# Full correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Restricting to numeric dtypes avoids the error DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0; the result is unchanged when every
# column is already numeric.
corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=(20, 7))
sns.heatmap(corr, annot=True, cmap='coolwarm')

# **Pairplot**

In [None]:
# Grid of pairwise scatter plots for the numeric columns, with each column's
# own distribution on the diagonal.
sns.pairplot(df)

# **Data VISUALISATION**

In [None]:
# Scrap percentage against temperature. The x-axis label spelling is corrected
# ("TEMPRATURE" -> "TEMPERATURE"); the trailing semicolon suppresses the
# notebook's text output for the last call.
plt.figure(figsize=(20, 7))
plt.scatter('TEMP', 'PERCENTAGE_SCRAP', data=df, c='r')
plt.xlabel('TEMPERATURE', fontsize='large')
plt.ylabel('SCRAP PERCENTAGE', fontsize='large');

In [None]:
# Histogram of the target. sns.distplot is deprecated (and removed in recent
# seaborn releases); histplot is its documented replacement and, like
# distplot(kde=False), plots raw counts. The y-label spelling is also corrected.
sns.histplot(df['PERCENTAGE_SCRAP'], color="r")
plt.title("Distribution of Scrap")
plt.ylabel("Number of Occurrences")
plt.xlabel("Percentage scrap");

In [None]:
# Distribution of scrap percentage for each distinct temperature value.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x="TEMP", y="PERCENTAGE_SCRAP", ax=ax)

In [None]:
# Distribution of scrap percentage for each distinct copper-content value.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x="COPPER_CONTENT", y="PERCENTAGE_SCRAP", ax=ax)

In [None]:
# Distribution of scrap percentage for each distinct nickel-content value.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x="NICKEL_CONTENT", y="PERCENTAGE_SCRAP", ax=ax)

In [None]:
# Distribution of scrap percentage for each distinct magnesium-content value.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x="MAGNESIUM_CONTENT", y="PERCENTAGE_SCRAP", ax=ax)

# **MISSING VALUE TREATMENT**

In [None]:
# Missing values per column, before imputation.
df.isnull().sum()

In [None]:
# Impute missing target values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] acts on an intermediate Series, which raises a FutureWarning in
# pandas 2.x and leaves df unchanged under Copy-on-Write.
df['PERCENTAGE_SCRAP'] = df['PERCENTAGE_SCRAP'].fillna(df['PERCENTAGE_SCRAP'].mean())

In [None]:
# Re-check missing values per column after the imputation step.
df.isnull().sum()

# **CREATING TRAINING SET and VALIDATION SET USING SKLEARN**

In [None]:
# Features: every column except the two scrap columns. Target: scrap percentage.
# Both are plain NumPy arrays, so column names are not carried forward.
X = df.drop(columns=['PERCENTAGE_SCRAP', 'SCRAP']).values
y = df['PERCENTAGE_SCRAP'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)
print(X_train.shape, X_test.shape)

# **Different MACHINE LEARNING MODELS USING SKLEARN**

# **LINEAR REGRESSION MODEL**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline model: ordinary least squares on the raw features.
reg = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report R^2.
pred = reg.predict(X_test)
score1 = r2_score(y_test, pred)
print(score1)


In [None]:
# Hyperparameters the baseline estimator was constructed with.
reg.get_params()

In [None]:
reg.intercept_      # fitted intercept term (B0) of the linear model

In [None]:
reg.coef_      # fitted coefficients (slopes), one per feature column of X_train

In [None]:
reg.score(X_test, y_test)    # R^2 on the test split (a regressor's .score is R^2, not accuracy)

In [None]:
reg.score(X_train, y_train)    # R^2 on the training split, for comparison with the test R^2

In [None]:
# Recompute the baseline model's predictions for the test split and display them.
result = reg.predict(X_test)
result

In [None]:
from sklearn.model_selection import train_test_split

# Second, 70/30 split of the same X and y. A fixed seed (the same value used
# for the 80/20 split) makes the metrics computed from x1/x2/y1/y2
# reproducible across runs; without it every run produced a different split.
x1, x2, y1, y2 = train_test_split(X, y, test_size=0.3, random_state=7)

In [None]:
from sklearn.linear_model import LinearRegression
# Second linear model, fitted on the training part (x1, y1) of the 70/30 split.
model = LinearRegression()
model.fit(x1,y1)

In [None]:
# Predictions of the second linear model for the held-out part of the 70/30 split.
y_pred = model.predict(x2)

# **R-Squared Score (Coefficient of Determination)**

In [None]:
# R^2 of the second linear model on its held-out split.
model.score(x2,y2)

# **Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out split. Arguments follow the documented
# (y_true, y_pred) order; MAE is symmetric, so the value is unchanged.
mean_absolute_error(y2, y_pred)

# **Root Mean Squared Error**

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error (RMSE) on the held-out split.
# The square root is taken explicitly because the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form gives the same
# value on every version. Arguments follow the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(y2, y_pred))

# **Cross Validation Score**

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the linear model on the full X, y.
# With no `scoring` given, each fold is scored with the estimator's own .score (R^2).
cross_val_score(model,X,y,cv = 5)

# **Polynomial Features**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion (bias, linear, squared and pairwise terms).
poly = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then reuse that fitted
# transformer on the test split. Refitting on test data (fit_transform) is the
# wrong pattern, even though this particular transformer learns only the
# feature layout.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression on the degree-2 polynomial features.
# NOTE: this rebinds `reg`, replacing the baseline model fitted on the raw features.
reg = LinearRegression()
reg.fit(X_train_poly,y_train)

In [None]:
# R^2 of the polynomial regression on the training split.
reg.score(X_train_poly,y_train)

In [None]:
# R^2 of the polynomial regression on the test split.
reg.score(X_test_poly,y_test)

# **Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge
# Ridge (L2-regularised) regression on the polynomial features;
# alpha is the regularisation strength.
ridge_reg = Ridge(alpha = 1)
ridge_reg.fit(X_train_poly, y_train)

In [None]:
# R^2 of the ridge model on the training split.
ridge_reg.score(X_train_poly,y_train)

In [None]:
# R^2 of the ridge model on the test split.
ridge_reg.score(X_test_poly,y_test)

# **Lasso regression**

In [None]:
from sklearn.linear_model import Lasso
# Lasso (L1-regularised) regression on the polynomial features, default settings.
la_reg = Lasso()
la_reg.fit(X_train_poly, y_train)

In [None]:
# R^2 of the lasso model on the training split.
la_reg.score(X_train_poly,y_train)

In [None]:
# R^2 of the lasso model on the test split.
la_reg.score(X_test_poly,y_test)

# **DECISION TREE REGRESSION MODEL**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters (no depth limit) and a fixed seed.
dec_tree_model = DecisionTreeRegressor(random_state=7).fit(X_train, y_train)

# R^2 of its predictions on the validation split.
y_pred_2 = dec_tree_model.predict(X_test)
score2 = r2_score(y_test, y_pred_2)
print(score2)

In [None]:
# R^2 of the default decision tree on the training split.
dec_tree_model.score(X_train,y_train)

In [None]:
# R^2 of the default decision tree on the test split.
dec_tree_model.score(X_test,y_test)

# **Hyperparameter Tuning 1**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree limited to depth 5, same seed as the default tree.
dec_tree_model = DecisionTreeRegressor(random_state=7, max_depth=5).fit(X_train, y_train)

# R^2 of its predictions on the validation split.
y_pred_2 = dec_tree_model.predict(X_test)
score2 = r2_score(y_test, y_pred_2)
print(score2)

In [None]:
# R^2 of the depth-5 decision tree on the training split.
dec_tree_model.score(X_train,y_train)

In [None]:
# R^2 of the depth-5 decision tree on the test split.
dec_tree_model.score(X_test,y_test)

# **Hyperparameter Tuning 2**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree limited to depth 10, same seed as the default tree.
dec_tree_model = DecisionTreeRegressor(random_state=7, max_depth=10).fit(X_train, y_train)

# R^2 of its predictions on the validation split.
y_pred_2 = dec_tree_model.predict(X_test)
score2 = r2_score(y_test, y_pred_2)
print(score2)

In [None]:
# R^2 of the depth-10 decision tree on the training split.
dec_tree_model.score(X_train,y_train)

In [None]:
# R^2 of the depth-10 decision tree on the test split.
dec_tree_model.score(X_test,y_test)

# **RANDOM FOREST MODEL**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings and a fixed seed.
random_forest_model = RandomForestRegressor(random_state=7).fit(X_train, y_train)

# R^2 of its predictions on the validation split.
y_pred_3 = random_forest_model.predict(X_test)
score3 = r2_score(y_test, y_pred_3)
print(score3)

In [None]:
# R^2 of the default random forest on the training split.
random_forest_model.score(X_train,y_train)

In [None]:
# R^2 of the default random forest on the test split.
random_forest_model.score(X_test,y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Much smaller forest: only 3 trees, seeded with 0.
random_forest_model = RandomForestRegressor(random_state=0, n_estimators=3).fit(X_train, y_train)

# R^2 of its predictions on the validation split.
y_pred_3 = random_forest_model.predict(X_test)
score3 = r2_score(y_test, y_pred_3)
print(score3)

In [None]:
# R^2 of the 3-tree random forest on the training split.
random_forest_model.score(X_train,y_train)

# **K-fold cross validation**

importing necessary packages

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of an untuned decision tree on the full X, y.
# Folds are contiguous (KFold does not shuffle by default) and the tree is unseeded.
kfold_validation = KFold(10)
model = DecisionTreeRegressor()
results = cross_val_score(model, X, y, cv=kfold_validation)

# Per-fold scores, followed by their mean.
print(results)
print(np.mean(results))

In [None]:
from sklearn.model_selection import ShuffleSplit

# Ten independent random 70/30 splits, each scored with a default random forest.
# Neither the splitter nor the forest is seeded, so scores vary between runs.
ssplit = ShuffleSplit(test_size=0.30, n_splits=10)
model = RandomForestRegressor()
results = cross_val_score(model, X, y, cv=ssplit)

In [None]:
# Per-split scores from the ShuffleSplit run.
results

# **Load Data Set Again**

In [None]:
# First five rows of the current frame. NOTE: nothing is re-read from disk here;
# this is the same de-duplicated, imputed `df` used by the cells before it.
df.head()

In [None]:
# Feature subset and target for the classification experiments.
# NOTE: this rebinds X, replacing the regression feature matrix.
cols = ["INSPECTED","RL2","TEMP","TILT_ANGLE","SILICON_CONTENT","COPPER_CONTENT","NICKEL_CONTENT","MAGNESIUM_CONTENT"]
X = df[cols].values
# Select the target as a Series so Y is 1-D with shape (n_samples,).
# df[["SCRAP"]] produced an (n_samples, 1) column vector, which makes
# scikit-learn classifiers emit a DataConversionWarning on fit.
Y = df["SCRAP"].values

In [None]:
# Shapes of the classification feature matrix and target.
print(X.shape,Y.shape)

# **Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
# Columns 1-3 of X are RL2, TEMP and TILT_ANGLE (per the order of `cols`).
# The encoder is refit for each column, and the column is overwritten with
# integer codes for its sorted unique values.
for col_idx in (1, 2, 3):
    X[:, col_idx] = enc.fit_transform(X[:, col_idx])

# **Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split
# Seeded 80/20 split for the classification task.
# NOTE: rebinds y_train / y_test, which previously held the regression targets.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 14)

# **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree classifier that chooses splits by entropy (information gain); seeded.
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 14)
dt_classifier.fit(x_train,y_train)

In [None]:
# Decision-tree predictions for the training and test splits.
pred_train = dt_classifier.predict(x_train)
pred_test = dt_classifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Decision-tree accuracy. Arguments follow the documented (y_true, y_pred)
# order; accuracy is symmetric, so the printed values are unchanged.
print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 8 trees using the entropy split criterion; seeded.
rf_classifier = RandomForestClassifier(n_estimators = 8, criterion = 'entropy', random_state =0)
rf_classifier.fit(x_train, y_train)

In [None]:
# Random-forest predictions for the training and test splits
# (overwrites the previous classifier's predictions).
pred_train = rf_classifier.predict(x_train)
pred_test = rf_classifier.predict(x_test)

In [None]:
# Random-forest accuracy. Arguments follow the documented (y_true, y_pred)
# order; accuracy is symmetric, so the printed values are unchanged.
print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))

# **SVC = Support Vector Classifier**

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel.
svm_classifier = SVC(kernel = 'linear' , random_state = 0)
svm_classifier.fit(x_train, y_train)

In [None]:
# SVC predictions for the training and test splits
# (overwrites the previous classifier's predictions).
pred_train = svm_classifier.predict(x_train)
pred_test = svm_classifier.predict(x_test)

In [None]:
# SVC accuracy. Arguments follow the documented (y_true, y_pred) order;
# accuracy is symmetric, so the printed values are unchanged.
print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))