In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw survey export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="survey_results_public.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Keep only the columns used below and expose the compensation column as "Salary".
df = (
    df[["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedCompYearly"]]
    .rename(columns={"ConvertedCompYearly": "Salary"})
)
df.head()

In [None]:
# Rows without a salary cannot be used as training targets, so discard them.
df = df.dropna(subset=["Salary"])
df.head()

In [None]:
# Show column dtypes and non-null counts for the rows that have a salary.
df.info()

In [None]:
# Drop every row that still has a missing value in any remaining column,
# then display the per-column null counts as a check.
df = df.dropna(how="any")
df.isna().sum()

In [None]:
# List the distinct employment answers before filtering on them.
print(df.Employment.unique())

In [None]:
# Keep respondents whose employment answer mentions full-time employment, then
# drop the column since it is not used as a feature.
df = (
    df.loc[df["Employment"].str.contains("Employed, full-time", na=False)]
    .drop(columns="Employment")
)
df.info()

In [None]:
# Count rows per country to see how many sparsely represented countries there are.
df['Country'].value_counts()

In [None]:
def shorten_categories(categories, cutoff):
    """Build a mapping that folds rare categories into a single 'Other' label.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label, e.g. the result of ``value_counts()``.
    cutoff : int
        Minimum count a category needs in order to keep its own label.

    Returns
    -------
    dict
        Maps each label to itself when its count is ``>= cutoff`` and to
        ``'Other'`` otherwise.
    """
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in categories.items()
    }


# Countries with fewer than 400 rows are grouped under 'Other' so each remaining
# category has enough samples.
country_map = shorten_categories(df.Country.value_counts(), 400)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

In [None]:
# Box plot of salary grouped by country, drawn on an explicitly created Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
df.boxplot(column='Salary', by='Country', ax=ax)
fig.suptitle('Salary (USD) vs Country')
# Blank the per-axes title so only the figure-level title is shown.
ax.set_title('')
ax.set_ylabel('Salary')
plt.xticks(rotation=90)
plt.show()


In [None]:
# List the distinct raw YearsCodePro answers to spot the non-numeric ones.
df["YearsCodePro"].unique()

In [14]:
def clean_exp(x):
    """Convert a ``YearsCodePro`` answer into a number of years.

    The two textual buckets map to fixed values ('Less than 1 year' -> 0.5,
    'More than 50 years' -> 50); every other value is passed to ``float``.
    """
    if x == 'Less than 1 year':
        return 0.5
    return 50 if x == 'More than 50 years' else float(x)

# Replace the raw answers with numeric years, element by element.
df['YearsCodePro'] = df['YearsCodePro'].map(clean_exp)
    
        

In [None]:
# Peek at the raw education answers before normalizing them.
df['EdLevel'].head()

In [16]:
def clean_ed(x):
    """Collapse a raw ``EdLevel`` answer into one of four coarse categories.

    Parameters
    ----------
    x : str
        Raw survey answer, or a label this function already produced.

    Returns
    -------
    str
        One of "Bachelor's degree", "Master's degree", "Post grad" or
        "Less than Bachelor's".

    Notes
    -----
    The function is idempotent: an already-cleaned label is returned unchanged.
    Without that guard, re-running the cell would turn "Less than Bachelor's"
    into "Bachelor's degree" (it contains "bachelor") and "Post grad" into
    "Less than Bachelor's".
    """
    # Normalize the typographic apostrophe and strip surrounding whitespace.
    x = x.replace("’", "'").strip()

    # Already-cleaned labels pass through untouched.
    if x in ("Bachelor's degree", "Master's degree", "Post grad", "Less than Bachelor's"):
        return x

    # Match on the core term, ignoring the details in parentheses.
    lowered = x.lower()
    if "bachelor" in lowered:
        return "Bachelor's degree"
    if "master" in lowered:
        return "Master's degree"
    # Professional and doctoral degrees are both beyond a master's degree.
    if "professional" in lowered or "doctoral" in lowered:
        return "Post grad"

    # Anything else is categorized as "Less than Bachelor's".
    return "Less than Bachelor's"
# Replace each raw education answer with its coarse category.
df['EdLevel'] = df['EdLevel'].map(clean_ed)


In [None]:
# Check which education categories remain after cleaning.
df['EdLevel'].unique()


In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the education categories, then replace the column with its
# integer codes. The fitted encoder is kept so new inputs can be encoded later.
le_education = LabelEncoder().fit(df['EdLevel'])
df['EdLevel'] = le_education.transform(df['EdLevel'])
df['EdLevel'].unique()
                                                                                   

In [None]:
# Same treatment for the country column: fit, encode, keep the encoder for reuse.
le_country = LabelEncoder().fit(df['Country'])
df['Country'] = le_country.transform(df['Country'])
df['Country'].unique()

In [20]:
# Features are every column except the target; the target is the salary.
X, y = df.drop(columns="Salary"), df["Salary"]


In [None]:
from sklearn.linear_model import LinearRegression
# Linear baseline fitted on the full feature matrix; the fitted estimator is
# the cell's displayed value.
linear_reg = LinearRegression().fit(X, y.values)
linear_reg


In [22]:
# Predict on the same X the model was fitted on, so these are in-sample predictions.
y_pred = linear_reg.predict(X)

In [23]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error of the predictions against the salaries they were fitted to.
error = np.sqrt(mean_squared_error(y,y_pred))

In [None]:
# Display the RMSE computed in the previous cell.
error

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed seed for repeatable fits; the fitted estimator is
# the cell's displayed value.
dec_tree_reg = DecisionTreeRegressor(random_state=0).fit(X, y.values)
dec_tree_reg

In [26]:
# In-sample predictions: X is the same data the tree was fitted on.
y_pred = dec_tree_reg.predict(X)

In [None]:
# RMSE of the decision tree's in-sample predictions, printed as a dollar amount.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed seed for repeatable fits; the fitted estimator is
# the cell's displayed value.
random_forest_reg = RandomForestRegressor(random_state=0).fit(X, y.values)
random_forest_reg


In [29]:
# In-sample predictions: X is the same data the forest was fitted on.
y_pred = random_forest_reg.predict(X)

In [None]:
# RMSE of the random forest's in-sample predictions, printed as a dollar amount.
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: unlimited (None) plus the even depths 2..12.
max_depth = [None, *range(2, 13, 2)]
parameters = dict(max_depth=max_depth)

# Cross-validated search over tree depth, scored by negative MSE.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(X, y.values)

In [None]:
# Take the best tree found by the search, fit it on all the data, and report
# its in-sample RMSE as a dollar amount.
regressor = gs.best_estimator_
y_pred = regressor.fit(X, y.values).predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print(f"${error:,.02f}")

In [None]:
# Single example to score. Columns follow the training feature order:
# Country, EdLevel, YearsCodePro.
X = pd.DataFrame(
    {
        'Country': ['United States of America'],
        'EdLevel': ["Master's degree"],
        'YearsCodePro': [15],
    }
)
X

In [35]:
# Encode the categorical inputs with the encoders fitted on the training data.
# Each column is replaced by label rather than written into with iloc: writing
# integer codes into the existing text column leaves it non-numeric, and is
# rejected outright once pandas infers a dedicated string dtype for text.
X['Country'] = le_country.transform(X['Country'])
X['EdLevel'] = le_education.transform(X['EdLevel'])


In [36]:
# Cast every column to float64 so the model receives a purely numeric frame.
X = X.astype("float64")

In [None]:
# Display the encoded, float-typed example row.
X

In [None]:
# Predict the salary for the single encoded example and display it.
y_pred = regressor.predict(X)
y_pred

In [41]:
import pickle

In [42]:
# Bundle the tuned model with both fitted encoders so inference code can
# reproduce the same preprocessing, then write the bundle to disk.
data = dict(model=regressor, le_country=le_country, le_education=le_education)
with open("saved_steps.pkl", "wb") as file:
    pickle.dump(data, file)

In [44]:
# Read back the bundle written above as a round-trip check.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded, le_country, le_education = (
    data[key] for key in ("model", "le_country", "le_education")
)


In [None]:
# Score the same example with the model restored from disk and display the prediction.
y_pred = regressor_loaded.predict(X)
y_pred