K-means clustering and linear regression on the DM Project dataset (`DM_Project.csv`)

In [None]:
# DM Project

In [None]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the project CSV with pandas and replace the spaces in two column names
# with underscores so the columns can be reached by attribute access later on.
dataset = (
    pd.read_csv('../input/datamining/DM_Project.csv')
    .rename(columns={'Gross Margin': 'Gross_Margin', 'Job Type': 'Job_Type'})
)
print(dataset)

In [None]:
# Data cleaning: summarise the frame, discard every row that has at least one
# missing value, then summarise again so the effect of the drop is visible.
dataset.info()
dataset = dataset.dropna(axis=0, how='any')
dataset.info()

In [None]:
import seaborn as sns
# Columns shown in the exploratory pair plots.
pairplot_columns = [
    'Job_Type',
    'Zip_',
    'Income',
    'Marketing Expense / order',
    'Dummy_Recall',
    'Dummy_Member',
    'Total Technician Paid Time',
    'Dummy_Estimate',
    'Gross_Margin',
]
x = dataset.loc[:, pairplot_columns]
print(x)
print(x.shape)
# One pair plot per grouping column: first coloured by Zip_, then by Job_Type.
sns.pairplot(x, hue='Zip_')
sns.pairplot(x, hue='Job_Type')

In [None]:
# Column selection: take the clustering inputs by position and convert them to
# a plain NumPy array.
# NOTE(review): the positions presumably correspond to Income,
# 'Marketing Expense / order', Dummy_Recall, Dummy_Member,
# 'Total Technician Paid Time', Dummy_Estimate and Gross_Margin — confirm
# against the CSV column order.
feature_positions = [3, 4, 7, 9, 10, 12, 13]
x = dataset.iloc[:, feature_positions].to_numpy()
x

In [None]:
#scaling dataset
from sklearn.preprocessing import StandardScaler

# StandardScaler.fit() only learns the per-column mean and standard deviation;
# it does not modify its input.  The transform has to be applied and the result
# assigned back to `x`, otherwise every later cell keeps working on the
# unscaled values and the distance-based clustering is dominated by the
# columns with the largest numeric range.
scaler = StandardScaler()
x = scaler.fit_transform(x)
x

In [None]:
# Elbow method: fit k-means for k = 1..10 and record the within-cluster sum of
# squares (inertia) of every fit.
from sklearn.cluster import KMeans

cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(x)
    wcss.append(kmeans.inertia_)

# Line graph of WCSS against k; the bend ('the elbow') suggests a cluster count.
f = plt.figure(figsize=(16, 9))
plt.plot(cluster_counts, wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within cluster sum of squares
plt.show()

In [None]:
# Fit the final 4-cluster k-means model on x and keep the label of every row.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit(x).labels_
print(y_kmeans.shape)
print(len(y_kmeans))
y_kmeans

In [None]:

plt.figure(figsize=(20, 20))

# Visualising the clusters: one scatter series per label, drawn on the first
# two columns of x.
cluster_styles = [
    (0, 'red', 'Cluster1'),
    (1, 'blue', 'Cluster2'),
    (2, 'green', 'Cluster3'),
    (3, 'black', 'Cluster4'),
]
for cluster_id, colour, series_label in cluster_styles:
    members = y_kmeans == cluster_id
    plt.scatter(x[members, 0], x[members, 1], s=5, c=colour, label=series_label)

# Plotting the centroids of the clusters on the same two columns
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=30, c='yellow', label='Centroids')

plt.legend()


In [None]:
# Attach each row's k-means label to the frame as the column 'y_cluster'.
dataset = dataset.assign(y_cluster=y_kmeans)
dataset

In [None]:
# Drop six raw columns from the working frame.
columns_to_drop = [
    'Location Zip',
    'Completion Date',
    'Recall',
    'Member Status',
    'Estimate Accepted Online',
    'Per Hr charge',
]
dataset = dataset.drop(columns=columns_to_drop)
dataset

In [None]:
# One-hot encode Zip_, Job_Type and the cluster label (first level dropped to
# avoid the dummy-variable trap).
#
# Every group gets an explicit prefix: without one, get_dummies names the new
# columns after the raw category values, which can be non-string (e.g. numeric
# zip codes) or collide with each other / with existing columns once the
# frames are concatenated.
# dtype=int keeps the dummies numeric: recent pandas versions default to bool
# dummies, and a frame mixing bool and float columns is converted to an
# object array that statsmodels' OLS refuses to fit.
zipcode = pd.get_dummies(dataset.Zip_, drop_first=True, prefix="Zip", dtype=int)
jobtype = pd.get_dummies(dataset.Job_Type, drop_first=True, prefix="Job", dtype=int)
clusters = pd.get_dummies(dataset.y_cluster, drop_first=True, prefix="Cluster", dtype=int)

# Append the dummy columns, then remove the three columns they were built from.
merged = pd.concat([dataset, zipcode, jobtype, clusters], axis=1)
final = merged.drop(['Job_Type', 'Zip_', 'y_cluster'], axis='columns')
final.info()

In [None]:
# Feature selection: annotated correlation heat map over the one-hot encoded
# frame `final`.
corrmat = final.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
g = sns.heatmap(
    final.loc[:, top_corr_features].corr(),
    annot=True,
    cmap="RdYlGn",
)


In [None]:
# Linear regression on the one-hot encoded frame `final`
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import datasets

# Predictors are every column except the target; the response is Gross_Margin.
x = final.drop(columns='Gross_Margin')
y = final['Gross_Margin']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
# Predict on the held-out rows, then show the fitted coefficients followed by
# the intercept.
y_predicted = model.predict(X_test)
for fitted_parameter in (model.coef_, model.intercept_):
    print(fitted_parameter)

In [None]:
# Actual vs predicted: points on the dashed red diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_predicted, y_test, edgecolors=(0, 0, 1))
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()



In [None]:
# Evaluate the predictions against the held-out test targets
from sklearn import metrics
from sklearn import datasets

mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predicted)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_predicted)

# Assemble the report first, then emit it in one go.
report_lines = [
    "The model performance for testing set",
    "--------------------------------------",
    'MAE is {}'.format(mae),
    'MSE is {}'.format(mse),
    'R2 score is {}'.format(r2),
]
print("\n".join(report_lines))

We have a very high R2 on the held-out test set, which suggests overfitting (or leakage of the target into the features).

In [None]:
# Adjusted R-squared of an OLS fit on all rows (intercept column added to x)
import statsmodels.api as sm

x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x).fit()
print(model.rsquared_adj)

In [None]:

# Fit a statsmodels OLS on the training split.  statsmodels does not add an
# intercept on its own, so a constant column is added to both splits first.
import statsmodels.api as sm

X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
results.params

In [None]:
# Print the full statsmodels regression report for the OLS fit on the training split
print(results.summary())

##### We have a very high adjusted R2, which suggests overfitting.
We will reduce the number of features based on the correlation matrix.

# F- Test
* If your linear regression model fits well, the R-squared value will be close to 1.
* Adjusted R-squared penalizes the R-squared value if you keep adding unnecessary features to your model.
* If adjusted R-squared is much lower than R-squared, it is a sign that you are using a feature that has very little impact on the target.
* The F-statistic (F-test) is used to assess the significance of the overall regression model.

**It compares the existing model with multiple features against an intercept-only model (without features). The null hypothesis is that these two models are equal.**
The alternate hypothesis is that the intercept-only model is worse than our model.
We get back a P-value (Prob (F-statistic)) and an F-statistic value that tell us whether to reject the null hypothesis.
### If the P-value (Prob (F-statistic)) is < 0.05 and the F-statistic is well above 1, this indicates a significant relationship between the target and the features.

# T-Test
* T-test will take into account one feature at a time.
* The null hypothesis in this case is that the feature's coefficient is equal to 0, and the alternate hypothesis is that the coefficient is not equal to 0.

* If the P>|t| value is 0 or close to 0, you reject the null hypothesis in favour of the alternate hypothesis.

**Omnibus tests** are a kind of statistical test. They test whether the explained variance in a set of data is significantly greater than the unexplained variance, overall.


**AIC and BIC** differ by the way they penalize the number of parameters of a model. More precisely, BIC criterion will induce a higher penalization for models with an intricate parametrization in comparison with AIC criterion.

**Log-Likelihood**
* Coefficients of a linear regression model can be estimated using a negative log-likelihood function from maximum likelihood estimation.
* The negative log-likelihood function can be used to derive the least squares solution to linear regression.

## Durbin-Watson Test

The Durbin Watson Test is a measure of **autocorrelation (also called serial correlation)** in residuals from regression analysis. Autocorrelation is the similarity of a time series over successive time intervals. 

* It can lead to underestimates of the standard error and can cause you to think predictors are significant when they are not. The Durbin Watson test looks for a specific type of serial correlation, the AR(1) process.

The Hypotheses for the Durbin Watson test are:
* H0 = no first order autocorrelation.
* H1 = first order autocorrelation exists.

## Jarque-Bera Test

The Jarque-Bera Test, is a test for normality. 
Specifically, the test compares the skewness and kurtosis of the data with those of a normal distribution. The data could take many forms, including:

* Time Series Data.
* Errors in a regression model.
* Data in a Vector.


**A normal distribution has a skew of zero (i.e. it’s perfectly symmetrical around the mean) and a kurtosis of three; kurtosis tells you how much data is in the tails and gives you an idea about how “peaked” the distribution is. It’s not necessary to know the mean or the standard deviation for the data in order to run the test.**

In [None]:
##Feature selection with heatmap on non Encoded data
# Correlation is only defined for numeric columns.  Older pandas silently
# skipped non-numeric columns in DataFrame.corr(), but pandas >= 2.0 raises
# instead, so the numeric columns are selected explicitly first.
# (Job_Type / Zip_ are still in `dataset` here and may be text — confirm.)
numeric_dataset = dataset.select_dtypes(include='number')

#get correlations of each numeric feature in dataset
corrmat = numeric_dataset.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map (reuse corrmat instead of computing the same matrix twice)
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
# Summarise the columns currently in `dataset` before more of them are dropped
dataset.info()

In [None]:
# Feature selection: remove four columns from the working frame before the
# second regression.
dropped_features = ['Zip_', 'Job_Type', 'y_cluster', 'Dummy_Member']
dataset = dataset.drop(columns=dropped_features)
dataset

In [None]:
# Regression on the reduced (feature-selected) dataset
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Predictors are every remaining column except the target Gross_Margin.
x = dataset.drop(columns='Gross_Margin')
y = dataset['Gross_Margin']

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# statsmodels does not add an intercept on its own.
# NOTE(review): add_constant defaults to has_constant='skip'; if a split already
# contains a constant non-zero column no 'const' column is added — confirm both
# splits end up with the same columns.
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()
results.params

In [None]:
# Print the full statsmodels regression report for the OLS fit on the reduced feature set
print(results.summary())

In [None]:
# Predict Gross_Margin for the held-out rows with the fitted OLS results
y_predicted = results.predict(X_test)

In [None]:
# Actual vs predicted for the reduced model: points on the dashed red diagonal
# are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_predicted, y_test, edgecolors=(0, 0, 1))
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Evaluate the reduced model's predictions against the held-out test targets
from sklearn import metrics
from sklearn import datasets

mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predicted)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_predicted)

# Assemble the report first, then emit it in one go.
report_lines = [
    "The model performance for testing set",
    "--------------------------------------",
    'MAE is {}'.format(mae),
    'MSE is {}'.format(mse),
    'R2 score is {}'.format(r2),
]
print("\n".join(report_lines))

We have a very high R2 on the held-out test set, which suggests overfitting (or leakage of the target into the features).

In [None]:
#display adjusted R-squared
import statsmodels.api as sm

# The original passed an upper-case `X`, which is not defined anywhere in the
# visible notebook and raises NameError; the predictor frame built for this
# regression is the lower-case `x`.
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
print(model.rsquared_adj)

##### We have a very high adjusted R2, which suggests overfitting.
We will reduce the number of features based on the correlation matrix.

# Backup Code

In [None]:
#Backup Code