# Importing the packages

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import copy
import matplotlib.pyplot as plt
import plotly
import seaborn as sns
import plotly.graph_objects as go
!pip install pygal
# Importing pygal and its styles
import pygal
from pygal.style import Style

from IPython.display import display, HTML
from datetime import datetime, timedelta
from mlxtend.preprocessing import TransactionEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


from sklearn.model_selection import train_test_split
# Initialise plotly's notebook mode (connected=True is passed through to plotly).
plotly.offline.init_notebook_mode (connected = True)
# Render every matplotlib figure at 500 dpi by default.
plt.rcParams['figure.dpi'] = 500


!pip install apyori
from apyori import apriori


In [None]:
# HTML page template used to show a rendered pygal chart inline.
# The two <script> tags pull in the JavaScript files referenced by pygal
# (svg.jquery.js and pygal-tooltips.min.js); `{rendered_chart}` is replaced
# with the chart markup by `pygalplot`.
base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js"></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""


def pygalplot(chart):
    """Render a pygal chart and display it inline in the notebook.

    Args:
        chart: any object exposing ``render(is_unicode=True)`` that returns
            the chart markup as a string (the pygal charts built in this
            notebook).

    Returns:
        None. The chart is shown through ``IPython.display.display``.
    """
    rendered_chart = chart.render(is_unicode=True)
    plot_html = base_html.format(rendered_chart=rendered_chart)
    display(HTML(plot_html))

# Importing the data and having the first look of the data

In [None]:
# Read the raw survey export.
data = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    low_memory=False,
)

# Keep the first 20 columns plus a handful of later questions.
columnsss = ['Q15', 'Q20', 'Q21', 'Q24', 'Q25']
columnss = list(data.columns[:20]) + columnsss
data = data[columnss]

# Use the values of the first row as the column names, then discard that row.
data.columns = data.iloc[0, :].values
data = data.drop(index=0)

# Drop respondents with a missing value in any of the first six columns
# or in any column from position 20 onwards.
data = data.dropna(subset=data.columns[:6])
data = data.dropna(subset=data.columns[20:])

# Preview the cleaned frame.
data.head()

# Number of Responses of the survey all over the world

In [None]:
# Respondents per country: `a` holds the country names, `b` the matching counts.
country_counts = data['In which country do you currently reside?'].value_counts()
a = list(country_counts.index)
b = list(country_counts.values)

# World map coloured by the number of respondents in each country.
choropleth = go.Choropleth(
    locations=a,                     # country names used as locations
    z=b,                             # values that drive the colour scale
    locationmode='country names',    # `locations` are matched by country name
    colorscale='Reds',
    colorbar_title="Total Number",
)
fig = go.Figure(data=choropleth)

fig.update_layout(
    title_text='Data Analysis on the basis of number of survey in different countries',
    geo_scope='world',               # show the whole world map
)

fig.show()

# Having a look at the responses of certain age groups

In [None]:
# Count the rows for every (age, gender) combination.
gg = data.groupby(
    ['What is your age (# years)?', 'What is your gender? - Selected Choice']
).count()

# The grouped index is a list of (age, gender) tuples; turn it into a 2-D array.
indexx = np.array(list(gg.index))

# One row per (age, gender) pair with its respondent count.
age_group = pd.DataFrame()
age_group['Age'] = indexx[:, 0]
age_group['Gender'] = indexx[:, 1]
age_group['Number'] = list(gg['Duration (in seconds)'].values)

# Bar chart of counts per age group, coloured by gender.
px.bar(data_frame=age_group, x='Age', y='Number', color='Gender',
       template='plotly_dark')


# Having a look at the duration of the Survey and how much time it takes to respond to it

In [None]:
# Distribution of the time respondents spent on the survey.
sns.set_style("darkgrid")
pt = sns.distplot(data['Duration (in seconds)'])
# Axis limits must be ordered (low, high) pairs. The y limit used to be a
# set literal, whose iteration order is not guaranteed.
pt.set(xlim=(0, 200000), ylim=(0, 0.000001))

# Having a look at the composition of People who gave the survey

In [None]:
# Share of respondents per job title.
role_column = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'
nam = data[role_column].value_counts()
px.pie(values=nam.values, names=nam.index)

# Having a look at the Salary Composition of the People who gave the survey

In [None]:
# Salary brackets in ascending order, used to order the bars.
salary_order = ['$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', 
                '4,000-4,999', '5,000-7,499', '7,500-9,999', '10,000-14,999',
                '15,000-19,999', '20,000-24,999', '25,000-29,999', 
                '30,000-39,999', '40,000-49,999', '50,000-59,999', 
                '60,000-69,999', '70,000-79,999', '80,000-89,999', 
                '90,000-99,999', '100,000-124,999', '125,000-149,999',
                '150,000-199,999', '200,000-249,999', '250,000-299,999', 
                '300,000-500,000', '> $500,000']

# Respondents per bracket. `reindex` keeps the ascending order and reports 0
# for a bracket nobody selected, where label indexing would raise KeyError.
salary = (
    data['What is your current yearly compensation (approximate $USD)?']
    .fillna('unknown')
    .value_counts()
    .reindex(salary_order, fill_value=0)
)

px.bar(x = salary, 
         y = salary.index,labels={'x':'Total Number','y':'Salary Range of the person'})

# Checking the count by Sex and Age

In [None]:
def _salary_upper_bound(bracket):
    """Return the upper bound of a salary bracket label as a digit string."""
    cleaned = (bracket.replace('$', '')
               .replace('> 500,000', '500,000-500,000')
               .replace(',', ''))
    return cleaned.split("-")[1]


# Upper bound of every respondent's salary bracket, plus one, as a float.
# Missing answers are routed through the 'unknown' placeholder and end up NaN.
max_salary = (
    data['What is your current yearly compensation (approximate $USD)?']
    .fillna('unknown-unknown')
    .apply(_salary_upper_bound)
    .replace('unknown', np.nan)
    .astype('float64') + 1
)

new_df = pd.DataFrame({'max_salary': max_salary, 'age': data['What is your age (# years)?']})
new_df['sex'] = data['What is your gender? - Selected Choice']

# Respondent counts per (sex, age) for Man/Woman only. `crosstab` builds the
# same table as value_counts + pivot did, without relying on positional
# `pivot` arguments or on the name pandas gives the counts column.
man_woman = new_df[new_df.sex.isin(['Man', 'Woman'])]
hm = pd.crosstab(man_woman['sex'], man_woman['age'])

plt.figure(figsize=(10, 5))
plt.title('Count by sex and age', size = 15, fontweight = 'bold', fontfamily = 'serif')
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Age', fontfamily = 'serif')
plt.ylabel('Sex', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

# Checking the count by position and Education

In [None]:
# Education levels in the order they should appear on the heatmap's x axis.
education_order = ['No formal education past high school', 
              'Some college/university study without earning a bachelor’s degree',
              'Professional degree', 'Bachelor’s degree', 
              'Master’s degree', 'Doctoral degree']
new_df['education'] = data['What is the highest level of formal education that you have attained or plan to attain within the next 2 years?']

new_df['position'] = data['Select the title most similar to your current role (or most recent title if retired): - Selected Choice']

new_df['experience'] = data['For how many years have you been writing code and/or programming?']

# Respondent counts per (position, education). `crosstab` avoids the
# version-dependent positional `pivot` call; `reindex` selects and orders the
# education columns and fills a level nobody selected with 0.
hm = pd.crosstab(new_df['position'], new_df['education']) \
    .reindex(columns=education_order, fill_value=0)

plt.figure(figsize=(10, 6))
plt.title('Count by position and education', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)

plt.xlabel('Education', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

# Salary and Age Distribution

In [None]:
# List the distinct age-group labels, in order of first appearance.
data['What is your age (# years)?'].unique()

In [None]:
# For every age group (in order of first appearance) collect the upper bound
# of each respondent's salary bracket; '> $500,000' is recorded as 500000.
age_column = 'What is your age (# years)?'
pay_column = 'What is your current yearly compensation (approximate $USD)?'

salary = []
for age_label in data[age_column].unique():
    brackets = data.loc[data[age_column] == age_label, pay_column].values
    salary.append([
        500000 if bracket == '> $500,000'
        else int(bracket.split('-')[1].replace(',', ''))
        for bracket in brackets
    ])


In [None]:
# Box plot of salary upper bounds, one box per age group.
box_plot = pygal.Box(box_mode='tukey')
box_plot.title = 'Salary vs Age'
# Same ordering as the loop that built `salary` (order of first appearance).
x = data['What is your age (# years)?'].unique()
for age_label, age_salaries in zip(x, salary):
    box_plot.add(age_label, age_salaries)
pygalplot(box_plot)

# Salary Composition Of Data Scientists :)

In [None]:
# Number of Data Scientists in each salary bracket, shown as a gauge chart.
title_column = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'
pay_column = 'What is your current yearly compensation (approximate $USD)?'

group = data[data[title_column] == 'Data Scientist'].groupby(pay_column).count()
bracket_counts = group[title_column]

gauge_chart = pygal.Gauge(human_readable=True)
gauge_chart.title = 'Composition of Salary Of Data Scientists'
# The scale runs from 0 to the size of the largest bracket.
gauge_chart.range = [0, max(bracket_counts)]
for bracket, count in zip(group.index, bracket_counts.values):
    gauge_chart.add(bracket, count)
pygalplot(gauge_chart)

# Tree Map of Salary Vs Education

In [None]:
# For every education level (in order of first appearance) collect the upper
# bound of each respondent's salary bracket; '> $500,000' becomes 500000.
education_column = 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'
pay_column = 'What is your current yearly compensation (approximate $USD)?'

xx = data[education_column].unique()
salary = []
for level in xx:
    brackets = data.loc[data[education_column] == level, pay_column].values
    salary.append([
        500000 if bracket == '> $500,000'
        else int(bracket.split('-')[1].replace(',', ''))
        for bracket in brackets
    ])


In [None]:
# Treemap with one series of salary upper bounds per education level.
treemap = pygal.Treemap()
treemap.title = 'Salary Vs Education'
for level, level_salaries in zip(xx, salary):
    treemap.add(level, level_salaries)
pygalplot(treemap)


# Let's have a look at the mean salary of people with different education levels :)

In [None]:
# Mean salary upper bound for each education level, in the order of `salary`.
mm = [np.array(level_salaries).mean() for level_salaries in salary]


In [None]:
# Solid gauge per education level showing its mean salary against a fixed
# maximum of 100000.
gauge = pygal.SolidGauge(inner_radius=0.70)

for level, mean_salary in zip(xx, mm):
    gauge.add(level, [{'value': mean_salary, 'max_value': 100000}])
pygalplot(gauge)

# Salary By Experience

In [None]:
# For every experience bucket (listed from least to most experienced) collect
# the upper bound of each respondent's salary bracket; '> $500,000' becomes 500000.
experience_column = 'For how many years have you been writing code and/or programming?'
pay_column = 'What is your current yearly compensation (approximate $USD)?'

xxx = ['< 1 years','1-2 years','3-5 years','5-10 years','10-20 years','20+ years']
salarys = []
for bucket in xxx:
    brackets = data.loc[data[experience_column] == bucket, pay_column].values
    salarys.append([
        500000 if bracket == '> $500,000'
        else int(bracket.split('-')[1].replace(',', ''))
        for bracket in brackets
    ])

In [None]:
# Funnel chart with one series of salary upper bounds per experience bucket.
funnel_chart = pygal.Funnel()
funnel_chart.title = 'Salary By Experience'
for bucket, bucket_salaries in zip(xxx, salarys):
    funnel_chart.add(bucket, bucket_salaries)
pygalplot(funnel_chart)

# Age Distribution Of the Data

In [None]:
# Bar chart of the number of respondents in each age group.
fig, ax = plt.subplots(1,1, figsize=(15,6))
data_q1 = data['What is your age (# years)?'].value_counts().sort_index()
ax.bar(data_q1.index, data_q1, width=0.55, 
       edgecolor='darkgray', color='#d4dddd',
      linewidth=0.7)

# Write each count just above its bar.
for i in data_q1.index:
    ax.annotate(f"{data_q1[i]}",
                xy=(i, data_q1[i] + 100),
               va ='center', ha='center', fontweight='light',
               fontfamily='serif', color='#4a4a4a')
    
for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)
    
ax.set_ylim(0,4200)
# Pin the tick positions before assigning labels; otherwise the labels are
# attached to whatever ticks the automatic locator happens to produce.
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(data_q1.index, fontfamily='serif')
y_ticks = np.arange(0, 4001, 500)
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticks, fontfamily='serif')
fig.text(0.1, 0.95, 'Age Distribution', fontsize=15, fontweight='bold',
        fontfamily='serif')
ax.grid(axis='y', linestyle='-', alpha=0.4)
plt.show()

# Age/Gender Distribution

In [None]:
# Mirror bar chart: men above the axis, women below, one bar per age group.
gender_column = 'What is your gender? - Selected Choice'
age_column = 'What is your age (# years)?'

data1=copy.deepcopy(data)
# Every answer other than 'Man' / 'Woman' is relabelled 'ETC' and left out.
gender = data[gender_column]
data1[gender_column] = gender.where(gender.isin(['Man', 'Woman']), 'ETC')

# Counts per gender (rows) and age group (columns). fill_value=0 keeps an
# age group that is missing for one gender as 0 rather than NaN, so the bars
# and the annotations stay numeric.
data_q1q2 = (
    data1[data1[gender_column] != 'ETC']
    .groupby([gender_column])[age_column]
    .value_counts()
    .unstack(fill_value=0)
    .sort_index()
)
man = data_q1q2.loc['Man']
woman = -data_q1q2.loc['Woman']  # negated so the bars grow downwards

fig, ax = plt.subplots(1,1, figsize=(15,6))
ax.bar(man.index, man, width=0.55, color='#004c70', alpha=0.8,
      label='Male')
ax.bar(woman.index, woman, width=0.55, color='#990000', alpha=0.8, 
      label='Female')
ax.set_ylim(-1200,3500)

# Counts above the male bars.
for i in man.index:
    ax.annotate(f"{man[i]}",
               xy=(i, man[i] + 100),
               va = 'center', ha='center', fontweight='light',
               fontfamily='serif', color='#4a4a4a')
    
# Counts below the female bars (sign flipped back for display).
for i in woman.index:
    ax.annotate(f"{-woman[i]}",
               xy=(i, woman[i] - 100),
               va = 'center', ha='center', fontweight='light',
               fontfamily='serif', color='#4a4a4a')
    
for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)
    
# Pin the tick positions before assigning labels so they cannot misalign.
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(data_q1q2.columns, fontfamily='serif')
ax.set_yticks([])
ax.legend()
fig.text(0.16, 0.95, 'Age / Gender Distribution', fontsize=15,
        fontweight='bold', fontfamily='serif')
plt.show()

# Converting Categorical to numerical values

In [None]:
# Compact the data: keep the first seven columns and replace the individual
# language columns with a single count per respondent.
df2=copy.deepcopy(data)

# Missing answers become the literal string 'nan'.
df2=df2.fillna('nan')

# Only positions 7-19 hold the language answers. The columns from position
# 20 onwards are the extra questions (Q15 ... Q25) that were required to be
# non-null, so counting them added a constant to every respondent.
# NOTE(review): if one of these columns represents a "None" answer it is
# still counted as a language - confirm whether it should be excluded.
language_answers = df2.iloc[:,7:20]
num = (language_answers != 'nan').sum(axis=1).tolist()

df2=df2.iloc[:,:7]
df2['Total number of languages known']=num
df2.sort_values('Total number of languages known',ascending=False).head()


In [None]:
# Columns at positions 1-6 of df2: the ones that get integer-encoded below.
cou=df2.columns[1:7]
def factorize(name,data):
    """Replace the given columns of *data* with integer category codes.

    Args:
        name: iterable of column labels to encode.
        data: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object, with each listed column replaced by the
        codes produced by ``pd.factorize`` (numbered in order of first
        appearance).
    """
    for column in name:
        codes, _ = pd.factorize(data[column])
        data[column] = codes
    return data
# Integer-encode the columns selected in `cou`.
df2=factorize(cou,df2)

# Short column names. Note the trailing space in the last name: later cells
# refer to it with that exact spelling.
df2.columns=['Duration','Age','Gender','Country','Education','Title','Experience','Total number of languages known ']

# Count the missing values per column.
df2.isna().sum()


# Doing Clustering on Age , Experience and Number of languages known

In [None]:
# Feature matrix for the first clustering: age, experience and language count.
X=df2[['Age','Experience','Total number of languages known ']].values

In [None]:
# Building the model.
# KMeans with k-means++ initialisation; the number of clusters K is chosen
# with the elbow method on the values collected in `wcss`.
from sklearn.cluster import KMeans
# One entry per candidate K, appended by the loop in the next cell.
wcss=[]

In [None]:
# Fit KMeans for K = 1..10 and record each model's inertia_ for the elbow plot.
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0).fit(X)
    # inertia_ is the value this notebook plots as the within-cluster sum of
    # squares (wcss).
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: wcss against the number of clusters, used to pick K.
cluster_counts = range(1, 11)
plt.plot(cluster_counts, wcss, 'bx-')
plt.title('The Elbow Method')
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# The elbow in the plot above suggests using 3 clusters.

In [None]:
# Fit the final model with the 3 clusters chosen from the elbow plot.
kmeans = KMeans(n_clusters= 3, init='k-means++', random_state=0)
kmeans.fit(X)

In [None]:
# Cluster label of every row of X (fit_predict fits the model again first).
clusters=kmeans.fit_predict(X)

In [None]:
# 3-D scatter of the three clustering features, coloured by cluster label.
px.scatter_3d(df2,x='Age',y='Experience',z='Total number of languages known ',color=clusters)

# Doing clustering on the basis of all the data

In [None]:
# Feature matrix for the second clustering: every column of df2.
# NOTE(review): the columns are not scaled here, so a column with a much
# larger range (presumably 'Duration') may dominate the distances - confirm
# whether standardising first is wanted.
X=df2.values

In [None]:
# Fit KMeans for K = 1..10 on all columns and record each model's inertia_.
wcss = []
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0).fit(X)
    # inertia_ is the value this notebook plots as the within-cluster sum of
    # squares (wcss).
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: wcss against the number of clusters, used to pick K.
cluster_counts = range(1, 11)
plt.plot(cluster_counts, wcss, 'bx-')
plt.title('The Elbow Method')
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()

The elbow plot again suggests using 3 clusters.

In [None]:
# Fit the 3-cluster model on the full feature matrix.
kmeans = KMeans(n_clusters= 3, init='k-means++', random_state=0)
kmeans.fit(X)

In [None]:
# Cluster label of every row of X (fit_predict fits the model again first).
clusters=kmeans.fit_predict(X)

In [None]:
# Store each row's cluster label in a new 'Clusters' column of df2.
df2['Clusters']=clusters

In [None]:
# Preview the first rows of df2, now including the 'Clusters' column.
df2.head()

# Making the dataframe for apriori

In [None]:
# Language answers (column positions 7-19) for every respondent.
# The question-text row was already dropped when `data` was cleaned, so no
# row is skipped here; slicing from 1 would discard a real respondent.
apriori_data=data.iloc[:,7:20]

In [None]:
# Replace the long question texts with positional integer column names.
apriori_data.columns = list(range(len(apriori_data.columns)))


In [None]:
# Preview the frame that feeds the association-rule mining.
apriori_data.head()


# Trying out the functions

In [None]:
# One transaction per respondent, built from the first 10 answer columns.
# Missing answers are skipped: stringifying them would add a 'nan' pseudo-item
# to almost every transaction and let it appear in the mined item sets.
# NOTE(review): only the first 10 of the answer columns are used - confirm
# that leaving the remaining columns out is intended.
basket = apriori_data.iloc[:, :10].values  # materialised once, not per cell
records = [[str(item) for item in row if pd.notna(item)] for row in basket]

freq_langs = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
results = list(freq_langs)

print("There are {} Relation derived.".format(len(results)))


In [None]:
# For every mined relation keep its item set (element 0) and the value stored
# at [2][0][3], which this notebook treats as the lift of the relation.
association = [list(rule[0]) for rule in results]
lift = [rule[2][0][3] for rule in results]

rank = pd.DataFrame([association, lift]).transpose()
rank.columns = ['Association', 'Lift']

# Show the 10 relations with the highest lift
rank.sort_values('Lift', ascending=False).head(10)

In [None]:
# Flatten every transaction into one long list of items.
langs = [item for transaction in records for item in transaction]

# Distinct items across all transactions.
uniquelangs = list(set(langs))

uniquelangs


### Trying Eclat on pairs of languages

In [None]:
from itertools import combinations

# Remove missing answers from every transaction. The items are strings, so
# the placeholder to drop is the text 'nan'; an identity check against
# np.nan never matches it.
record = [
    [item for item in sublist if pd.notna(item) and item != 'nan']
    for sublist in records
]

# One-hot (True/False) encoding of the transactions.
te = TransactionEncoder()
te_ary = te.fit(record).transform(record)
df_x = pd.DataFrame(te_ary, columns=te.columns_) # encoding

# Every unordered pair of distinct items, in the same order the nested
# index loops produced them.
pair = [list(combo) for combo in combinations(uniquelangs, 2)]

# Pairs that involve the 'nan' placeholder are not real language pairs.
pairs = [p for p in pair if 'nan' not in p]

# Support of each pair: share of respondents whose transaction contains both
# items. Plain membership tests replace the former eval() of generated code.
baskets = [set(s) for s in record]
score = [
    sum(1 for s in baskets if s.issuperset(p)) / len(apriori_data)
    for p in pairs
]
ranking_ECLAT = pd.DataFrame([pairs, score]).transpose()
ranking_ECLAT.columns = ['Pair', 'Score']

ranking_ECLAT.sort_values('Score', ascending=False).head(10)


### Trying out ECLAT on trios of languages

In [None]:
from itertools import combinations

# Every unordered trio of distinct items. The former loops indexed the list
# with j, j+k and j+l instead of j, k and l, so most combinations were never
# generated.
trio = [list(combo) for combo in combinations(uniquelangs, 3)]

# Trios that involve the 'nan' placeholder are not real language trios.
trios = [t for t in trio if 'nan' not in t]

# Support of each trio: share of respondents whose transaction contains all
# three items. Plain membership tests replace the former eval() of generated code.
baskets = [set(s) for s in record]
score_trios = [
    sum(1 for s in baskets if s.issuperset(t)) / len(apriori_data)
    for t in trios
]
ranking_ECLAT_trios = pd.DataFrame([trios, score_trios]).transpose()
ranking_ECLAT_trios.columns = ['Trio', 'Score']

ranking_ECLAT_trios.sort_values('Score', ascending=False).head(10)


# Let's Try to make a Salary prediction System :)

## Step 1 .. Let's Select Some Features

In [None]:
# Drop the columns at positions 7-19 from `data`, in place.
# NOTE(review): the drop is positional, so running this cell a second time
# would remove further columns.
data.drop(columns=data.columns[7:20],inplace=True)

In [None]:
# Preview the reduced feature set.
data.head()

## Step 2 converting categorical to numerical

In [None]:
# Work on a copy so `data` keeps its original text values.
df3=copy.deepcopy(data)
# Integer-encode every column except the first one.
df3=factorize(df3.columns[1:],df3)


In [None]:
# Features: every column except the salary column that is being predicted.
X=df3.drop(columns=['What is your current yearly compensation (approximate $USD)?']).values

In [None]:
# Target: the integer-encoded salary bracket.
y=df3['What is your current yearly compensation (approximate $USD)?'].values

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

In [None]:
# Train a random forest with default settings on the training split.
# NOTE(review): no random_state is passed here although the split is seeded,
# so the reported accuracy may vary between runs - confirm if that is intended.
dc=RandomForestClassifier()
dc.fit(X_train,y_train)

In [None]:
# Share of test rows whose salary bracket is predicted exactly.
# accuracy_score takes the true labels first, then the predictions.
print('The accuracy score is :',accuracy_score(y_test,dc.predict(X_test)))