In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,10)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the first 100000 rows of the 2018 and 2019 salary files (df1 = 2018, df2 = 2019).
csv_paths = (
    '/kaggle/input/ontario/ontario-public-sector-salary-2018.csv',
    '/kaggle/input/ontario/ontario-public-sector-salary-2019.csv',
)
df1, df2 = [pd.read_csv(path, nrows=100000) for path in csv_paths]

In [None]:
# Stack the 2018 and 2019 extracts row-wise into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# year's rows keep their own 0-based labels, so every label appears twice and
# label-based lookups or index-aligned operations become ambiguous.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Column dtypes as parsed by read_csv, before any cleaning.
df.dtypes

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric ones.
df.describe(include = "all")

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Remove the employee name columns from df in place.
name_columns = ['Last Name', 'First Name']
df.drop(columns=name_columns, inplace=True)

In [None]:
# Preview the frame after the name columns have been dropped.
df.head()

In [None]:
def fuc(v):
    """Convert a currency-formatted value to a float.

    The dollar sign, thousands separators and surrounding whitespace are
    removed by content rather than by position, so "$ 1,234.50" and
    "$1,234.50" both parse to 1234.5. Slicing off a fixed two-character
    prefix would drop the leading digit of the second form.

    Args:
        v: A string such as "$ 1,234.50", or an already-numeric value
            (for example a float, including NaN, when the conversion cell
            is run a second time).

    Returns:
        The value as a float.

    Raises:
        ValueError: If the cleaned string is not a valid number.
    """
    # Non-string values have nothing to strip; this also makes re-running
    # the conversion on an already-converted column a harmless no-op.
    if not isinstance(v, str):
        return float(v)

    cleaned = v.replace("$", "").replace(",", "").strip()
    return float(cleaned)

In [None]:
# Convert both currency-formatted columns to floats with fuc.
for money_col in ('Salary Paid', 'Taxable Benefits'):
    df[money_col] = df[money_col].apply(fuc)

In [None]:
# Normalise the Sector label that carries a trailing non-breaking space.
sector_fixes = {"Colleges\xa0": "Colleges"}
df['Sector'] = df['Sector'].replace(sector_fixes)

In [None]:
# Row count per sector, to check the label cleanup.
df.Sector.value_counts()

In [None]:
# Re-check dtypes after the currency columns were converted with fuc.
df.dtypes

In [None]:
# The ten most frequent job titles and their row counts.
df['Job Title'].value_counts().head(10)

In [None]:

# Donut chart of the ten most frequent job titles.
top_titles = df['Job Title'].value_counts()[:10]
plt.pie(top_titles.values, labels=top_titles.index)
# A white disc drawn over the centre of the pie creates the donut hole.
hole = plt.Circle((0, 0), 0.7, color='white')
plt.gca().add_artist(hole)
plt.show()

In [None]:
# Bar chart of salary paid per sector.
# Sectors are ordered by their total salary paid, ascending.
plot_order = df.groupby('Sector')['Salary Paid'].sum().sort_values(ascending=True).index.values
# Hand the computed order to the plot so the bars actually follow it.
# Note: seaborn's bar kind draws the mean of 'Salary Paid' per sector by
# default, so bar heights are averages while the ordering follows the totals.
sns.catplot(x='Sector', y='Salary Paid', data=df, kind='bar',
            order=plot_order, aspect=2, height=6)

plt.xticks(rotation=90)
# Label for the vertical axis
plt.ylabel("Salary Paid by Government")
plt.show()

In [None]:
# Show the sector labels ordered by total salary paid (ascending).
print(plot_order)

In [None]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Density contour of salary by job title over the first 500 rows, with the
# first 100 rows overlaid as individual markers.
contour_rows = df.head(500)
marker_rows = df.head(100)
iplot([
    go.Histogram2dContour(
        x=contour_rows['Job Title'],
        y=contour_rows['Salary Paid'],
        # A plain dict replaces the deprecated top-level go.Contours class.
        contours=dict(coloring='heatmap'),
    ),
    go.Scatter(x=marker_rows['Job Title'], y=marker_rows['Salary Paid'], mode='markers'),
])

In [None]:
# Count the rows for every (Job Title, Salary Paid) pair.
# The result gets its own name rather than reusing df1, so the raw 2018 frame
# loaded earlier is not overwritten (re-running the concat cell afterwards
# would otherwise combine the wrong data).
title_salary_counts = df.assign(n=0).groupby(['Job Title', 'Salary Paid'])['n'].count().reset_index()
# NOTE(review): 'Salary Paid' holds full dollar amounts after parsing, so a
# cutoff of 100 may leave nothing to plot - confirm the intended threshold.
title_salary_counts = title_salary_counts[title_salary_counts["Salary Paid"] < 100]
# Pivot to a salary x job-title grid of counts (0 where a pair never occurs).
v = title_salary_counts.pivot(index='Salary Paid', columns='Job Title', values='n').fillna(0).values.tolist()
iplot([go.Surface(z=v)])

In [None]:
# Preview the cleaned frame again.
df.head()


In [None]:
import plotly.graph_objects as go

# 3-D scatter of salary vs. taxable benefits vs. calendar year, with the
# markers coloured by calendar year.
marker_style = dict(
    size=10,
    color=df['Calendar Year'],
    colorscale='Viridis',
)
salary_trace = go.Scatter3d(
    x=df['Salary Paid'],
    y=df['Taxable Benefits'],
    z=df['Calendar Year'],
    name='ontario salary analysis',
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[salary_trace])
fig.show()

In [None]:
# Pie chart of the ten most frequent job titles, drawn via pandas' plot accessor.
df['Job Title'].value_counts().head(10).plot.pie()

In [None]:
from sklearn import preprocessing

# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder, kept in label_encoders so the codes can be mapped back to the
# original labels later (a single shared encoder only remembers its last fit).
# 'Job Title' is encoded into a new 'Job_Title' column (no trailing space in
# the name) and the original text column is dropped afterwards.
label_encoders = {}
for source_col, encoded_col in (
    ('Sector', 'Sector'),
    ('Employer', 'Employer'),
    ('Job Title', 'Job_Title'),
):
    le = preprocessing.LabelEncoder()
    df[encoded_col] = le.fit_transform(df[source_col])
    label_encoders[source_col] = le
df.drop("Job Title", axis=1, inplace=True)

df.head()

In [None]:
# Feature frame: every column of df except the prediction target.
x = df.drop(columns=['Taxable Benefits'])
# Display the feature frame.
x

In [None]:
# Prediction target: the taxable-benefits column.
y = df['Taxable Benefits']
# Display the target series.
y

In [None]:
# Print the dtypes of the features and the target before modelling.
print(x.dtypes)
print(y.dtypes)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=4)
x_train, x_test, y_train, y_test = split_parts
# Print the four pieces in order: x_train, x_test, y_train, y_test.
for part in split_parts:
    print(part)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, confusion_matrix

In [None]:
# Candidate regressors, keyed by a short display name.
# Every estimator with a stochastic component gets a fixed random_state so the
# scores and the resulting ranking are repeatable across runs.
# KNeighborsRegressor takes no seed.
MODEL_SEED = 4  # same value as the train/test split's random_state
clfs = {
    'KNN': KNeighborsRegressor(),
    'MLP1': MLPRegressor(random_state=MODEL_SEED),
    'MLP2': MLPRegressor(hidden_layer_sizes=[100, 100], random_state=MODEL_SEED),
    'AdaBoost': AdaBoostRegressor(random_state=MODEL_SEED),
    'DecisionTree': DecisionTreeRegressor(random_state=MODEL_SEED),
    'RandomForest': RandomForestRegressor(random_state=MODEL_SEED),
    'GradientBoost': GradientBoostingRegressor(random_state=MODEL_SEED),
}

In [None]:
# Fit every candidate model on the training split and record its R^2 on the
# held-out test split.
r2_scores = dict()
for clf_name, clf in clfs.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # r2_score's signature is (y_true, y_pred). R^2 is not symmetric, so
    # passing the arguments the other way round reports a different number.
    r2_scores[clf_name] = r2_score(y_test, y_pred)
    print(clf_name, r2_scores[clf_name])

In [None]:
# Re-order the scores from best to worst (ties broken by model name, descending)
# and report the top entry; v holds the name of the best-scoring model.
ranked_items = sorted(
    r2_scores.items(),
    key=lambda item: (item[1], item[0]),
    reverse=True,
)
r2_scores = dict(ranked_items)
v = next(iter(r2_scores))
print("Classifier with high accuracy --> ",v)
print("With the accuracy of",r2_scores[v])

In [None]:
# Regression plot of the best model's test-set predictions (x) against the
# true target values (y).
fig, ax = plt.subplots(figsize=(10, 5))
best_predictions = clfs[v].predict(x_test)
sns.regplot(x=best_predictions, y=y_test, marker="*", ax=ax)
plt.show()