In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet


In [2]:
df = pd.read_csv("salary1.csv")

In [3]:
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,29,Male,Bachelor's,Account Manager,2,600000
1,25,Male,Bachelor's,Account Manager,2,600000
2,26,Male,Bachelor's,Account Manager,2,600000
3,25,Female,Bachelor's,Account Manager,2,600000
4,27,Female,Bachelor's,Account Manager,2,600000
...,...,...,...,...,...,...
287,27,Male,Master's,Web Developer,1,900000
288,32,Female,Master's,Web Developer,5,1300000
289,34,Male,Master's,Web Developer,8,1600000
290,37,Male,Master's,Web Developer,11,1900000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  292 non-null    int64 
 1   Gender               292 non-null    object
 2   Education Level      292 non-null    object
 3   Job Title            292 non-null    object
 4   Years of Experience  292 non-null    int64 
 5   Salary               292 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 13.8+ KB


In [5]:
df[df.duplicated()]

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
144,36,Male,Bachelor's,Marketing Analyst,13,2100000


In [6]:
df_rm = df.drop_duplicates(keep="first")

In [7]:
df_rm.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [8]:
df_pr = df_rm.dropna(how="any")

In [9]:
def augment_data(df):
    augmented_data = []
    for job_title, group in df.groupby('Job Title'):
        min_experience = group['Years of Experience'].min()
        max_experience = group['Years of Experience'].max()
        min_salary = group['Salary'].min()
        max_salary = group['Salary'].max()
        
        for _ in range(100):
            experience = np.random.randint(min_experience, max_experience + 1)
            
            # Calculate proportional increase in salary based on experience
            experience_ratio = (experience - min_experience) / (max_experience - min_experience)
            new_salary = min_salary + (max_salary - min_salary) * experience_ratio
            
            new_entry = group.iloc[0].copy()
            new_entry['Years of Experience'] = experience
            new_entry['Salary'] = new_salary
            augmented_data.append(new_entry)
    return pd.DataFrame(augmented_data)

# Augment data
augmented_df = augment_data(df_pr)
print(augmented_df.head())

   Age Gender Education Level        Job Title  Years of Experience  \
0   29   Male      Bachelor's  Account Manager                    3   
0   29   Male      Bachelor's  Account Manager                    3   
0   29   Male      Bachelor's  Account Manager                    1   
0   29   Male      Bachelor's  Account Manager                    1   
0   29   Male      Bachelor's  Account Manager                    5   

         Salary  
0  9.333333e+05  
0  9.333333e+05  
0  6.000000e+05  
0  6.000000e+05  
0  1.266667e+06  


In [10]:
augmented_data = []

for title in df_pr['Job Title'].unique():
    title_df = df_pr[df_pr['Job Title'] == title]
    average_salary = title_df['Salary'].mean()
    
    for index, row in title_df.iterrows():
        age = row['Age']
        experience = row['Years of Experience']
        salary = row['Salary']
        
        # Add new data points with age and experience modifications
        for _ in range(15):  # Generate 10 augmented samples for each original sample
            # Modify age and experience
            new_age = max(np.random.normal(age, 2), 20)  # Randomly change age with a normal distribution
            new_experience = max(np.random.normal(experience, 1), 0)  # Randomly change experience with a normal distribution
            
            # Generate new salary based on age and experience
            new_salary = average_salary * (1 + 0.02 * (new_age - age) + 0.05 * (new_experience - experience))
            
            # Append the augmented data point
            augmented_data.append({
                "Age": new_age,
                "Gender": row['Gender'],
                "Education Level": row['Education Level'],
                "Job Title": row['Job Title'],
                "Years of Experience": new_experience,
                "Salary": new_salary
            })

# Create DataFrame from augmented data
augmented_df = pd.DataFrame(augmented_data)

# Combine original and augmented data
combined_df = pd.concat([df_pr, augmented_df], ignore_index=True)

In [11]:
df_pr = augmented_df.copy()

In [12]:
df_pr

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,31.547072,Male,Bachelor's,Account Manager,1.610179,9.231481e+05
1,31.594071,Male,Bachelor's,Account Manager,2.794760,9.769994e+05
2,29.612964,Male,Bachelor's,Account Manager,1.739923,8.943336e+05
3,29.052778,Male,Bachelor's,Account Manager,1.113775,8.562862e+05
4,30.908302,Male,Bachelor's,Account Manager,1.471252,9.054971e+05
...,...,...,...,...,...,...
4360,41.762352,Male,Master's,Web Developer,16.161030,1.625344e+06
4361,44.514744,Male,Master's,Web Developer,16.976733,1.780593e+06
4362,40.330348,Male,Master's,Web Developer,16.966374,1.644180e+06
4363,44.842504,Male,Master's,Web Developer,16.139123,1.723366e+06


In [13]:
labelEncoder = LabelEncoder()

df_pr["Gender_Encoder"] = labelEncoder.fit_transform(df_pr["Gender"])
df_pr["Education_Level_Encoder"] = labelEncoder.fit_transform(df_pr["Education Level"])
education_list = {title: encoding for title, encoding in zip(labelEncoder.classes_, labelEncoder.transform(labelEncoder.classes_))}
df_pr["Job_Title_Encoder"] = labelEncoder.fit_transform(df_pr["Job Title"])
job_title_encoding_dict = {title: encoding for title, encoding in zip(labelEncoder.classes_, labelEncoder.transform(labelEncoder.classes_))}

In [14]:
education_list

{"Bachelor's": 0, "Master's": 1}

In [15]:
job_title_encoding_dict

{'Account Manager': 0,
 'Accountant': 1,
 'Blockchain Developer': 2,
 'Business Analyst': 3,
 'Business Intelligence Analyst': 4,
 'Cloud Engineer': 5,
 'Data Analyst': 6,
 'Data Scientist': 7,
 'Financial Analyst': 8,
 'Financial Manager': 9,
 'Graphic Designer': 10,
 'HR Manager': 11,
 'IOT Developer': 12,
 'Interior Designer': 13,
 'ML Engineer': 14,
 'Marketing Analyst': 15,
 'Marketing Manager': 16,
 'Network Engineer': 17,
 'Operations Analyst': 18,
 'Operations Manager': 19,
 'Product Designer': 20,
 'Product Manager': 21,
 'Sales Executive': 22,
 'Sales Manager': 23,
 'Software Developer': 24,
 'Software Engineer': 25,
 'Supply Chain Manager': 26,
 'UX Designer': 27,
 'Web Developer': 28}

In [16]:
joblib.dump(job_title_encoding_dict, 'job.pkl')

['job.pkl']

In [17]:
df_pr.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Gender_Encoder,Education_Level_Encoder,Job_Title_Encoder
0,31.547072,Male,Bachelor's,Account Manager,1.610179,923148.097871,1,0,0
1,31.594071,Male,Bachelor's,Account Manager,2.79476,976999.365662,1,0,0
2,29.612964,Male,Bachelor's,Account Manager,1.739923,894333.597498,1,0,0
3,29.052778,Male,Bachelor's,Account Manager,1.113775,856286.160095,1,0,0
4,30.908302,Male,Bachelor's,Account Manager,1.471252,905497.144647,1,0,0


In [18]:
stScaler = StandardScaler()
df_pr["age_scale"] = stScaler.fit_transform(df_pr[["Age"]])
df_pr["experience_scale"] = stScaler.fit_transform(df_pr[["Years of Experience"]])

In [19]:
joblib.dump(stScaler, "scaler.pkl")

['scaler.pkl']

In [20]:
df_pr.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Gender_Encoder,Education_Level_Encoder,Job_Title_Encoder,age_scale,experience_scale
0,31.547072,Male,Bachelor's,Account Manager,1.610179,923148.097871,1,0,0,-0.575981,-1.263009
1,31.594071,Male,Bachelor's,Account Manager,2.79476,976999.365662,1,0,0,-0.569156,-1.087755
2,29.612964,Male,Bachelor's,Account Manager,1.739923,894333.597498,1,0,0,-0.856835,-1.243814
3,29.052778,Male,Bachelor's,Account Manager,1.113775,856286.160095,1,0,0,-0.93818,-1.336451
4,30.908302,Male,Bachelor's,Account Manager,1.471252,905497.144647,1,0,0,-0.668737,-1.283563


In [21]:
df_pr["Education Level"].unique()

array(["Bachelor's", "Master's"], dtype=object)

In [22]:
jj = df_pr["Job Title"].unique()

In [23]:
# Create a dictionary with default value of 0 for all keys
job_title_dict = {title: 0 for title in jj}

# Printing the dictionary
print(job_title_dict)

{'Account Manager': 0, 'Accountant': 0, 'Blockchain Developer': 0, 'Business Analyst': 0, 'Business Intelligence Analyst': 0, 'Cloud Engineer': 0, 'Data Analyst': 0, 'Data Scientist': 0, 'Financial Analyst': 0, 'Financial Manager': 0, 'Graphic Designer': 0, 'HR Manager': 0, 'Interior Designer': 0, 'IOT Developer': 0, 'Marketing Analyst': 0, 'Marketing Manager': 0, 'ML Engineer': 0, 'Network Engineer': 0, 'Operations Analyst': 0, 'Operations Manager': 0, 'Product Designer': 0, 'Product Manager': 0, 'Sales Executive': 0, 'Sales Manager': 0, 'Software Developer': 0, 'Software Engineer': 0, 'Supply Chain Manager': 0, 'UX Designer': 0, 'Web Developer': 0}


In [24]:
X = df_pr[["Gender_Encoder", "Education_Level_Encoder", "Job_Title_Encoder", "age_scale","experience_scale"]]
Y = df_pr["Salary"]

In [25]:
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=500,random_state=42)
gbr_model = DecisionTreeRegressor(max_depth=500)
gbm_model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
elasticnet_model = ElasticNet(alpha=1.0, l1_ratio=0.5)


In [26]:
x_train, x_test , y_train, y_test = train_test_split(X,Y, test_size=0.2,random_state=42)

In [27]:
lr_model.fit(x_train,y_train)
rf_model.fit(x_train,y_train)
gbr_model.fit(x_train,y_train)
gbm_model.fit(x_train,y_train)
elasticnet_model.fit(x_train,y_train)

In [28]:
lr_pr = lr_model.predict(x_test)
rf_pr = rf_model.predict(x_test)
gbr_pr = gbr_model.predict(x_test)
gbm_pr = gbm_model.predict(x_test)
elasticnet_model = elasticnet_model.predict(x_test)

In [29]:
gbm_model

In [30]:
round(mean_squared_error(y_test, lr_pr),2)

166087833943.14

In [31]:
round(mean_squared_error(y_test, rf_pr),2)

8146853654.25

In [32]:
round(mean_squared_error(y_test, gbr_pr),2)

14119101965.01

In [33]:
round(r2_score(y_test, elasticnet_model),4)*100

36.38

In [34]:
round(r2_score(y_test, gbm_pr),4)*100

94.12

In [35]:
round(r2_score(y_test, rf_pr),4)*100

97.00999999999999

In [36]:
round(r2_score(y_test, gbr_pr),4)*100

94.82000000000001

In [37]:
joblib.dump(gbm_model,"lr.pkl")

['lr.pkl']

In [38]:
joblib.dump(rf_model,"rf.pkl")

['rf.pkl']

In [39]:
joblib.dump(gbr_model,"dt.pkl")

['dt.pkl']

In [40]:
stScaler.transform([[50]])[0][0]



5.896089443712768

In [41]:
rf_model.predict([[stScaler.transform([[30]])[0][0],0,0,7,stScaler.transform([[1]])[0][0]]])[0] - rf_model.predict([[stScaler.transform([[30]])[0][0],0,0,7,stScaler.transform([[3]])[0][0]]])[0]



-72485.1033975851

In [42]:
for i in range(30):
    print(i, gbm_model.predict([[stScaler.transform([[30]])[0][0],0,0,0,stScaler.transform([[i]])[0][0]]])[0])

0 895111.5034828357
1 895111.5034828357
2 914981.8625055705
3 939836.6280080621
4 943507.1495181097
5 943507.1495181097
6 958965.5432717984
7 954254.6897523516
8 947537.4647902296
9 954849.5482096816
10 950236.6404818327
11 950236.6404818327
12 1013614.9306108969
13 1099003.3370016746
14 941308.8164324934
15 948890.5857474097
16 979948.8064394165
17 987419.1750060951
18 987419.1750060951
19 987419.1750060951
20 987419.1750060951
21 987419.1750060951
22 992810.4485739812
23 992810.4485739812
24 1050256.258633787
25 1056682.3946906375
26 1080768.78924954
27 1080768.78924954
28 1080768.78924954
29 1080768.78924954




In [43]:
gbm_pr.perdict()

AttributeError: 'numpy.ndarray' object has no attribute 'perdict'

In [None]:
[theme]
primaryColor = "#d33682"
backgroundColor = "#002b36"
secondaryBackgroundColor = "#586e75"
textColor = "#fff"

NameError: name 'theme' is not defined