# In [3]:
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# is its replacement. The legacy module is kept only as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

init_notebook_mode(connected=True)

# Load the dataset from the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="../input/2015.csv")

# Plotting the correlation matrix twice: with matshow() and as a seaborn heatmap

# Select the numeric columns explicitly. Recent pandas versions raise on
# non-numeric columns (e.g. country names) instead of silently dropping them.
c = dataset.select_dtypes(include=[np.number]).corr()

# matshow() opens a figure of its own.
plt.matshow(c)
# Rotate the column names on the x axis so long labels do not overlap.
plt.xticks(range(len(c.columns)), c.columns, rotation=90)
plt.yticks(range(len(c.columns)), c.columns)

# Start a fresh figure for the heatmap: seaborn draws on the current axes,
# which would otherwise still be the matshow() axes created above.
plt.figure()
sns.heatmap(c)

# Calculating the average happiness score per region and plotting it

# Series of mean 'Happiness Score' values, indexed by 'Region'.
s = dataset.groupby(['Region'])['Happiness Score'].mean()

# seaborn needs 'Region' as a regular column, so copy the index into one
# (the index itself is left in place).
df = pd.DataFrame(s)
df['Region'] = df.index

# Draw in a new figure so the strip plot does not land on the axes of
# whatever was plotted previously.
plt.figure()
sns.stripplot(x="Region", y="Happiness Score", data=df, jitter=True)

# Plotting the happiness scores on the world map

# Choropleth trace: one entry per country, matched to the map by country
# name and coloured by its happiness score.
data = dict(type='choropleth',
            locations=dataset['Country'],
            locationmode='country names',
            z=dataset['Happiness Score'],
            text=dataset['Country'],
            colorbar={'title': 'Happiness'})

# plotly's projection types are lowercase enum values; 'Mercator' is not
# one of them, so the projection has to be spelled 'mercator'.
layout = dict(title='Global Happiness',
              geo=dict(showframe=False,
                       projection={'type': 'mercator'}))

choromap3 = go.Figure(data=[data], layout=layout)
# NOTE(review): plot() renders to a standalone HTML file; iplot() is the
# inline-notebook variant if that is what is wanted here.
plot(choromap3)

# K-MEANS CLUSTERING

# Features are taken by position (columns 4..11), the target from column 3.
# NOTE(review): this relies on the column order of the CSV - confirm that
# these positions are the intended feature/target columns.
X = dataset.iloc[:, 4:12].values
Y = dataset.iloc[:, 3].values

# Using the Elbow Method to find the optimal number of clusters:
# fit K-Means for k = 1..10 and record the inertia (within-cluster sum of
# squares) reached by each fit.
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    km.fit(X)
    wcss.append(km.inertia_)

# Open a dedicated figure; otherwise the curve is drawn on top of whichever
# axes happen to be current from earlier plotting calls.
plt.figure()
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Fit K-Means with 5 clusters and keep the cluster label of every sample
km = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
n_km = km.fit_predict(X)

# Project the features onto two PCA components so the clusters can be
# shown in a 2-D scatter plot, coloured by cluster label
pca_2 = PCA(n_components=2)
plot_columns = pca_2.fit_transform(X)
plt.scatter(plot_columns[:, 0], plot_columns[:, 1], c=n_km)
plt.title("Scatterplot for Clustering")
plt.xlabel("Canonical Variable 1")
plt.ylabel("Canonical Variable 2")
plt.show()

# Hold out 20% of the samples as the test set (fixed seed for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Feature scaling: fit the scaler on the training split only, then apply
# the same transformation to both splits
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

# Fit a multiple linear regression model on the scaled training set
regressor = LinearRegression().fit(X_train, Y_train)

# Predicting the test set results
y_pred = regressor.predict(X_test)

# Plotting the predictions against the original values
plt.scatter(Y_test, y_pred)
plt.xlabel("Original Value")
plt.ylabel("Predicted Value")
plt.show()

print('Coefficients: \n', regressor.coef_)
# The mean squared error, reusing the predictions computed above
print("Mean squared error: %.2f" % np.mean((y_pred - Y_test) ** 2))
# LinearRegression.score() returns the coefficient of determination (R^2),
# not the explained variance score; 1 is perfect prediction
print('R^2 score: %.2f' % regressor.score(X_test, Y_test))

# Applying k-Fold Cross Validation (10 folds) on the training set.
# With no explicit `scoring`, cross_val_score uses the estimator's own
# score(), so for this regressor the values are R^2 scores, not accuracies.
# NOTE(review): X_train was scaled before the folds were formed, so each
# validation fold has already influenced the scaler.
accuracies = cross_val_score(estimator=regressor, X=X_train, y=Y_train, cv=10)

# Print the summary explicitly; bare expressions are discarded in a script
# and only the last one is echoed in a notebook cell.
print("Cross-validation R^2 mean: %.3f" % accuracies.mean())
print("Cross-validation R^2 std: %.3f" % accuracies.std())