This demo notebook analyzes data from the Stack Overflow Developer Survey 2020.

In [None]:
#Author : Jun 
#Date: 17/02/2020
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
import seaborn as sns

load data

In [None]:
# Read the public survey results into a DataFrame.
survey_csv_path = "../input/developer-survey-2020/survey_results_public.csv"
survey = pd.read_csv(survey_csv_path)

# Print the DataFrame summary (data information).
survey.info()

respondents in each country

In [None]:
# Count respondents per country; value_counts lists the most frequent first.
country_column = survey['Country']
countries = country_column.value_counts()
countries.iloc[:5]  # the five countries with the most respondents

In [None]:
import plotly.graph_objects as go

# World map coloured by respondent count: one region per country name in
# `countries.index`, shaded on the 'Reds' scale by the matching count.
respondent_map = go.Choropleth(
    locations=countries.index,
    z=countries,
    locationmode='country names',
    colorscale='Reds',
)
fig = go.Figure(data=respondent_map)
fig.show()

select two features

In [None]:
# Build the working frame from three survey columns: the two predictors
# (EdLevel, Country) plus ConvertedComp, the salary column used as the target.
columns = ['EdLevel','Country','ConvertedComp'] # two features + ConvertedComp (target variable: salary)
df = pd.DataFrame(survey, columns=columns) 
df.head(10)

EDA

In [None]:
# EDA: share of respondents at each education level.
edlevel_counts = df['EdLevel'].value_counts()
edlevel_counts.plot.pie(title='Education Level', autopct='%1.1f%%')

In [None]:
# bad EDA. How to improve it?
# (every distinct country becomes its own wedge of the pie)
country_counts = df['Country'].value_counts()
country_counts.plot.pie(title='Country', autopct='%1.1f%%')

handle missing values

In [None]:
# Remove every row that has a NaN in any of the selected columns. The frame is
# modified in place, so `df` itself is shorter after this cell runs.
# Improvement: handle missing values instead of discarding the rows.
df.dropna(inplace=True) # drop all rows containing NaN values
df.head(10)

In [None]:
# Shorten the education description: cast to str, keep the first four characters.
df['EdLevel'] = df['EdLevel'].astype(str).str[:4]

EDA

In [None]:
# EDA: median salary (ConvertedComp) for each education level, as horizontal bars.
grouped = df.groupby('EdLevel')
median_salary = grouped['ConvertedComp'].median()
median_salary.plot(kind='barh');

create target class

In [None]:
# Create the target variable y by splitting salaries at the overall median.
salary_median = df['ConvertedComp'].median()

# Negating "<= median" (rather than testing "> median") keeps the original
# rule exactly: anything that is not at or below the median is labelled 1.
at_or_below_median = df['ConvertedComp'] <= salary_median
y = (~at_or_below_median).astype('int64')  # 1 for high, 0 for low

df['Income'] = y  # income is a categorical variable
df.head(10)

create features

In [None]:
# Create the feature set X from the two predictor columns.
columns = ['EdLevel', 'Country']
X = df.reindex(columns=columns)
X.head()

encode

In [None]:
# Encode the categorical feature columns with pandas' get_dummies and rebind X
# to the encoded frame.
# Improvement: use one-hot encoding for nominal data and an ordinal encoder
# for ordinal data.
X = pd.get_dummies(X) # encode categorical features as dummy/indicator columns
X.head(10)

split into training and test data

In [None]:
# Split the data into training and test sets: 30% of the rows are held out
# for testing, with a fixed seed so the split is repeatable.
# Improvement: usually need validation data for hyperparameter tuning
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

create decision tree

In [None]:
# Decision tree for classification: entropy criterion, depth capped at 10,
# fixed seed. Fit on the training split, then score on the held-out test split.
dt = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=1,
).fit(X_train, y_train)
y_pred = dt.predict(X_test)

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: {0:.3f}".format(test_accuracy), "\n")  # print accuracy


* Cluster analysis of the low-income class and EDA with crosstab in Lecture 04

In [None]:
# Group the rows by income class and keep the low-income group (Income == 0)
# as its own frame with the same columns as df.
grouped = df.groupby('Income')
low_income_rows = grouped.get_group(0)
df2 = pd.DataFrame(low_income_rows, columns=df.columns)
df2.head()

EDA for cluster results

In [None]:
# Cluster the low-income respondents on their two categorical features.
#
# The feature columns are selected by name instead of dropping the others:
# a later cell adds a 'Cluster' column to df2, and with drop() a re-run of
# this cell would feed those earlier labels back in as a feature.
X2 = df2[['EdLevel', 'Country']]
X2 = pd.get_dummies(X2) #OneHot Encoding. Improvement: Edlevel belongs to ordinal data

# random_state makes the clustering, and therefore which group is labelled 0
# or 1, repeatable between runs; 1 matches the seed used for the split and the
# tree. n_init is set explicitly because its default differs between
# scikit-learn versions.
km = KMeans(n_clusters=2, n_init=10, random_state=1) # 2 clusters for low income group
y2 = km.fit_predict(X2) # labels of sample

In [None]:
# Attach the cluster label to each low-income row, then compare clusters by
# education level. normalize=True turns the counts into fractions of all rows.
df2['Cluster'] = y2
churn_crosstab = pd.crosstab(
    index=df2['Cluster'],
    columns=df2["EdLevel"],
    normalize=True,
)
churn_crosstab.plot.bar(grid=True)
plt.show()

In [None]:
# bad EDA. How to improve it?
# (cluster vs. country, with one bar series per distinct country)
churn_crosstab = pd.crosstab(
    index=df2['Cluster'],
    columns=df2['Country'],
    normalize=True,
)
churn_crosstab.plot.bar(grid=True)
plt.show()

In [None]:
def select_countries(x):
    """Collapse a country name into one of five display groups.

    "United States" becomes "USA" and "United Kingdom" becomes "UK";
    "India" and "Germany" are kept as they are; every other value becomes
    "Others".

    The short labels "USA" and "UK" are also accepted and returned unchanged,
    so applying the function to its own output is a no-op. This matters in a
    notebook: the cell that defines this function also overwrites the
    'Country' column with the result, and without the pass-through a second
    run of that cell would turn every "USA" and "UK" into "Others".

    Parameters
    ----------
    x : str
        Country name (or an already-shortened label).

    Returns
    -------
    str
        One of "USA", "UK", "India", "Germany", "Others".
    """
    display_group = {
        "United States": "USA",
        "United Kingdom": "UK",
        "India": "India",
        "Germany": "Germany",
        # Already-shortened labels map to themselves (idempotence).
        "USA": "USA",
        "UK": "UK",
    }
    return display_group.get(x, "Others")
    
# Replace each country in the low-income frame with its display group
# (values are cast to str first, then mapped through select_countries).
country_as_text = df2['Country'].astype(str)
df2['Country'] = country_as_text.map(select_countries)
df2.head()

In [None]:
# Cluster vs. country again, now that 'Country' holds only the grouped labels.
# normalize=True expresses each cell as a fraction of all low-income rows.
churn_crosstab = pd.crosstab(
    index=df2['Cluster'],
    columns=df2['Country'],
    normalize=True,
)
churn_crosstab.plot.bar(grid=True)
plt.show()

* cluster 1: bachelor's degree (EdLevel `Bach`); more developers from India
* cluster 0: other degrees; more developers from other countries