# Exercise 4: Data exploration and visualization

We will work with the "IT Salary Survey EU 2020" dataset <br> See: https://www.kaggle.com/datasets/parulpandey/2020-it-salary-survey-for-eu-region/

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the survey answers from the CSV file (relative path, i.e. the file
# is looked up in the current working directory)
survey_csv = "IT Salary Survey EU  2020.csv"
data_df = pd.read_csv(survey_csv)

# Preview the first rows
data_df.head()

In [None]:
# List all columns


## Explore programming languages

In [None]:
# Let's work on a copy so the originally loaded data stays untouched
df = data_df.copy(deep=True)
df.shape[0]  # number of rows

In [None]:
# Check if values are missing (NaN)


In [None]:
# Show rows with missing values


In [None]:
# Remove missing values

len(df)

**$\rightarrow$ categorical / nominal variable**

In [None]:
# List all (unique) programming languages


In [None]:
# Convert to lowercase


In [None]:
# Find top 10 languages

top10

In [None]:
# Join the top-10 language names with "|" (the regex alternation character)
languages = "|".join(top10.index)
languages

In [None]:
# Reduce each free-text answer to the first known language name it contains.
# Regex alternatives are tried left to right, so "javascript" has to be
# listed before "java"; answers without any match become NaN.
lang_column = "Your main technology / programming language"
pattern = r"(javascript|java|python|php|c\+\+|c#|swift|scala|go|kotlin)"

extracted = df[lang_column].str.extract(pattern)
df[lang_column] = extracted

In [None]:
# Check top 10 again

top10

In [None]:
# Sort by count (ascending), then split into labels and values for plotting
counts = top10.sort_values()
categories, count = counts.index, counts.values

In [None]:
# use plt.bar()

plt.xlabel("programming language")
plt.ylabel("count")
plt.show()

In [None]:
# horizontal barplot

plt.show()

### Preparing a plot for a (scientific) figure
**Size and font size**
* Page width: about 7 inches (= 180 mm, a typical DIN A4 page with margins)
* We want to fit two plots next to each other $\rightarrow$ about 3 inches each
* We set the font size to 8
* The font should be editable in Inkscape!

**Improving visuals**
* Declutter = Remove unnecessary elements such as plot borders, grid lines etc.
* Draw attention to the important elements
* Avoid 3D effects
* Use color sparingly, avoid rainbow colormaps etc.

In [None]:
# Global matplotlib settings for all following figures
plt.rcParams.update({
    # Base font size -- for now: 12, later: 8
    'font.size': 12,
    # Write text as real text into the SVG file so it stays editable
    'svg.fonttype': 'none',
})

In [None]:
plt.figure(figsize=(2.5, 3))  # 2.5 to 3 inches width
plt.title("Top 10 programming languages")
plt.xlabel('Number of respondents')

# Horizontal barplot


# Add count next to each bar


# Remove chart border & y-axis (declutter!)


# Save as PNG and SVG


plt.show()

## Let's visualize the column "Age"

In [None]:
# Let's start again from a fresh copy of the loaded data
df = data_df.copy(deep=True)
df.shape[0]  # number of rows

In [None]:
# Summary statistics of the age column
# (alternative: df["Age"].value_counts() counts each individual age)
df["Age"].describe()

**$\rightarrow$ discrete variable**

In [None]:
# Remove missing values

len(df)

In [None]:
# Bin edges for the age histogram: 20, 22, ..., 68 (the stop value 69 is excluded)
# NOTE(review): ages above 68 would fall outside the last bin -- compare with df.Age.max()
first_edge, stop, bin_width = 20, 69, 2
bins = np.arange(first_edge, stop, bin_width)
bins

In [None]:
# Histogram using plt.hist()

plt.show()

In [None]:
ret

In [None]:
# Seaborn: Easy to use for pandas DataFrames!

plt.show()

In [None]:
sns.set_style("ticks")  # styles: {darkgrid, whitegrid, dark, white, ticks}

In [None]:
sns.histplot(df, x='Age', bins=bins)

# Compute average age

print(mean_age)

# Highlight average age by plotting a vertical line

plt.legend()
plt.show()

In [None]:
# Width: 3 inches
plt.figure(figsize=(3, 2))
plt.title("Age distribution")

# Histogram
sns.histplot(df, x='Age', bins=bins)

# Highlight average age by plotting a vertical line


# Declutter: Remove the upper and right border


# Save as SVG
# savefig() does not create missing folders, so make sure "plots/" exists first
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/age.svg", bbox_inches="tight")
plt.show()

## Exploring age and total years of experience

In [None]:
# Fresh copy of the loaded data for the age / experience analysis
df = data_df.copy(deep=True)
df.shape[0]  # number of rows

In [None]:
# Drop every row that has no value in "Total years of experience"
df = df.dropna(subset=["Total years of experience"])
df.shape[0]  # rows left

In [None]:
# Scatter plot: Years of experience vs. age
plt.figure(figsize=(12, 5))

_ = plt.xticks(rotation=90)

In [None]:
# Check datatype of the column


Pandas datatype **object (O)**: String or mixed numeric and non-numeric values

In [None]:
# replace non-numeric values in column "Total years of experience" with NaN


# remove NaN's from "Total years of experience" again
df = df[~df["Total years of experience"].isnull()]
len(df)

In [None]:
# Check datatype of the column
df["Total years of experience"].dtype

In [None]:
# Plot it again, now that the column is numeric
plt.figure(figsize=(12, 5))
plt.scatter(df["Total years of experience"], df["Age"])
# Label both axes and end with plt.show() so the cell renders only the figure
plt.xlabel("Total years of experience")
plt.ylabel("Age")
plt.show()

**Remove outliers**

In [None]:
experience = df["Total years of experience"]

# Histogram of the years of experience
experience.plot.hist(bins=100)
plt.xlabel("Total years of experience")
# Optional: restrict the x-axis with plt.xlim(0, 100)
plt.show()
# --> positively skewed distribution

# Boxplot of the same column, outliers (fliers) included;
# the returned dict is kept to inspect the whiskers afterwards
boxplot = plt.boxplot(experience, showfliers=True)
plt.ylabel("Years")
plt.show()

In [None]:
[item.get_ydata() for item in boxplot['whiskers']]

In [None]:
df['Total years of experience'].describe()

In [None]:
# Q3 + whis*(Q3-Q1)

upper_limit

In [None]:
# Q1 - whis*(Q3-Q1)

lower_limit

In [None]:
# Apply the limits to our DataFrame


In [None]:
# Plot it again without outliers
plt.figure(figsize=(10, 5))
plt.scatter(df["Total years of experience"], df["Age"])
# Label both axes (the x label was an empty string before)
plt.xlabel("Total years of experience")
plt.ylabel("Age")
plt.show()

In [None]:
# Publication-sized figure: 3 x 2 inches
plt.figure(figsize=(3, 2))
plt.title("Age vs. years of experience")
plt.xlabel("Years of experience")
plt.ylabel("Age in years")

# Scatter plot (small markers to reduce overplotting)
plt.scatter(df["Total years of experience"], df["Age"], s=10)

# Declutter
sns.despine()

# Save as SVG
# savefig() does not create missing folders, so make sure "plots/" exists first
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/age_experience.svg", bbox_inches="tight")
plt.show()

## Analyzing salary & seniority level
Now, let's explore the yearly gross ("brutto") salary and how it differs across seniority levels. <br>
We will look at different plot types for visualizing distributions and for comparing distributions among groups.

In [None]:
# Fresh copy of the loaded data for the salary analysis
df = data_df.copy(deep=True)

In [None]:
# Shorten the lengthy salary column name for easier handling
salary_rename = {"Yearly brutto salary (without bonus and stocks) in EUR": "Yearly brutto salary in EUR"}
df = df.rename(columns=salary_rename)

In [None]:
# Boxplot of salary distribution (here: using seaborn)


Outlier removal, as shown before:

In [None]:
df['Yearly brutto salary in EUR'].describe()

In [None]:
# Compute the quartiles from the data instead of copying them by hand from
# the describe() output, so the limits stay correct if the data changes.
# NOTE: run this cell once per fresh copy of the data -- running it again on
# the already filtered df recomputes the quartiles and trims further.
salary = df["Yearly brutto salary in EUR"]
q1 = salary.quantile(0.25)
q3 = salary.quantile(0.75)
whis = 1.5  # whisker length in multiples of the interquartile range

# Q3 + whis*(Q3 - Q1)
upper_limit = q3 + whis * (q3 - q1)

# Q1 - whis*(Q3 - Q1)
lower_limit = q1 - whis * (q3 - q1)

# Apply limits to DataFrame (both bounds inclusive; NaN salaries are dropped)
df = df[salary.between(lower_limit, upper_limit)]

In [None]:
# Boxplot


# Highlight median


plt.show()

In [None]:
# Barplot with errorbar (standard deviation)


# Highlight mean


plt.show()

In [None]:
# Drop respondents without a value in column "Seniority level"
df = df.dropna(subset=['Seniority level'])

In [None]:
# Salary distribution per seniority level, one box per level
sns.boxplot(x="Seniority level", y="Yearly brutto salary in EUR", data=df)
plt.show()

In [None]:
# Find top 5 seniority levels

top5

In [None]:
# Remove rows, only keep top 5 seniority levels


In [None]:
# Boxplot

plt.show()

In [None]:
# Violinplot

plt.show()

In [None]:
# Barplot with errorbar

plt.show()

In [None]:
# Only errorbar

plt.show()

In [None]:
# Show raw values -> stripplot

plt.show()

In [None]:
# Show raw values -> swarmplot

plt.show()

In [None]:
# Order seniority levels by their median salary

order

In [None]:
plt.figure(figsize=(3.5, 2))
plt.title("Salary by seniority level")

# Boxplot

# Stripplot


# Declutter
sns.despine()

# Save as SVG
# savefig() does not create missing folders, so make sure "plots/" exists first
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/salary_seniority.svg", bbox_inches="tight")
plt.show()

## Bonus: Colormaps

### Qualitative colormaps
E.g. for showing different categories

In [None]:
sns.color_palette("tab10")

In [None]:
sns.color_palette("pastel")

In [None]:
sns.color_palette("Set1")

### Sequential colormaps


In [None]:
sns.color_palette("Blues", as_cmap=True)

### Perceptually uniform sequential colormaps
= equal steps in values are perceived as equal steps in the color space <br>
Great for displaying heatmaps for example

In [None]:
sns.color_palette("viridis", as_cmap=True)

In [None]:
sns.color_palette("plasma", as_cmap=True)

In [None]:
# Example: "Flights" dataset (seaborn example data)
flights = sns.load_dataset("flights")

# Reshape from long to wide format: one row per month, one column per year
passengers = flights.pivot(index="month", columns="year", values="passengers")

# Heatmap of passenger numbers with the "plasma" colormap
sns.heatmap(passengers, cmap="plasma")
plt.show()

### Diverging colormaps
Use these if there is a meaningful middle point. <br>
Example: Spearman correlation (-1: negative correlation, 0: no correlation, 1: positive correlation)

In [None]:
sns.color_palette("RdBu", as_cmap=True)

In [None]:
sns.color_palette("coolwarm", as_cmap=True)

**matplotlib colormaps** and when to use which: https://matplotlib.org/stable/users/explain/colors/colormaps.html