In [1]:

!pip install scikit-learn pandas numpy matplotlib seaborn




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
!unzip /content/archive.zip -d /content/


Archive:  /content/archive.zip
  inflating: /content/freelancer_earnings - freelancer_earnings_vs_skillstack_dataset.csv  


!find /content -name "*.csv"


In [8]:
import pandas as pd

# Location of the CSV that was extracted from archive.zip (the file name
# contains spaces, so it is kept as one explicit string).
csv_path = "/content/freelancer_earnings - freelancer_earnings_vs_skillstack_dataset.csv"

# Load the raw freelancer table and preview the first rows.
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,freelancer_id,category,primary_skills,years_experience,experience_level,region,country,education,hourly_rate_usd,annual_income_usd,primary_platform
0,FL0001,AI/ML Engineering,"Computer Vision, TensorFlow, Data Science",1.6,junior,Middle East,Israel,Master,66.41,"$58,873.61",Direct Client
1,FL0002,Backend Development,"PHP, Ruby, Go",8.3,senior,Middle East,UAE,Bootcamp,112.79,"$108,183.05",Freelancer
2,FL0003,UI/UX Design,"Sketch, Wireframing, Adobe XD",3.6,mid,North America,US,Self-taught,94.21,"$88,681.22",Freelancer
3,FL0004,DevOps,"Kubernetes, Terraform, AWS",2.3,mid,North America,US,Self-taught,118.87,"$109,229.42",Upwork
4,FL0005,DevOps,"Azure, Docker, Kubernetes",8.8,senior,Latin America,Mexico,Self-taught,82.76,"$66,298.22",LinkedIn


In [9]:
# Clean the raw frame for modelling. Both steps are guarded so the cell can be
# executed more than once without raising.

# annual_income_usd is loaded as currency text such as "$58,873.61": strip the
# "$" sign and the thousands separators, then convert to float. The dtype check
# skips the conversion when the column is already numeric (the `.str` accessor
# is not available on a float column).
if not pd.api.types.is_numeric_dtype(df['annual_income_usd']):
    df['annual_income_usd'] = (
        df['annual_income_usd']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# Drop ID column (not useful for ML). errors='ignore' turns this into a no-op
# when the column has already been removed by an earlier run of this cell.
df = df.drop(columns=['freelancer_id'], errors='ignore')

# Summary of the remaining columns, their non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           500 non-null    object 
 1   primary_skills     500 non-null    object 
 2   years_experience   500 non-null    float64
 3   experience_level   500 non-null    object 
 4   region             500 non-null    object 
 5   country            500 non-null    object 
 6   education          500 non-null    object 
 7   hourly_rate_usd    500 non-null    float64
 8   annual_income_usd  500 non-null    float64
 9   primary_platform   500 non-null    object 
dtypes: float64(3), object(7)
memory usage: 39.2+ KB


In [10]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per text (object-dtype) column, keyed by column name, so
# the integer codes can be mapped back to the original labels later on.
object_columns = df.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in object_columns}

# Fit each encoder on its column and overwrite the column with integer codes.
# NOTE(review): this also encodes nominal features (e.g. country) as integers,
# which gives them an ordering that distance-based and linear models will
# treat as meaningful — confirm this is intended.
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])


In [11]:
# Regression target
X_reg = df.drop(columns=['annual_income_usd'])
y_reg = df['annual_income_usd']

# Classification target
X_clf = df.drop(columns=['experience_level'])
y_clf = df['experience_level']


In [12]:
from sklearn.model_selection import train_test_split

# Same hold-out settings for both tasks: 20% of the rows go to the test set
# and the shuffle is seeded so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42}

# Regression split.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, **split_options)

# Classification split.
# NOTE(review): the split is not stratified on y_clf; consider whether the
# class proportions should be preserved between train and test.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, **split_options)


In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise the features of each task with its OWN scaler.
# A StandardScaler keeps the statistics learned by its most recent fit, so
# sharing one instance between the two tasks would leave it holding only the
# classification statistics; reusing it on regression inputs later would then
# apply the wrong mean/scale without raising an error.
# Each scaler is fit on the training rows only and then applied to the test
# rows, so no test-set statistics leak into training.

# Regression features.
scaler_reg = StandardScaler()
Xr_train = scaler_reg.fit_transform(Xr_train)
Xr_test = scaler_reg.transform(Xr_test)

# Classification features.
scaler_clf = StandardScaler()
Xc_train = scaler_clf.fit_transform(Xc_train)
Xc_test = scaler_clf.transform(Xc_test)

# Backward-compatible alias: code that still refers to `scaler` gets the
# classification scaler, which is what the name was last fit on previously.
scaler = scaler_clf


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least-squares baseline for annual income, trained on the
# standardised regression features.
lr = LinearRegression().fit(Xr_train, yr_train)

# Predictions on the held-out rows.
y_pred_lr = lr.predict(Xr_test)

# Hold-out metrics: mean squared error (in squared target units) and R^2.
mse_lr = mean_squared_error(yr_test, y_pred_lr)
r2_lr = r2_score(yr_test, y_pred_lr)

print("Linear Regression")
print("MSE:", mse_lr)
print("R2 Score:", r2_lr)


Linear Regression
MSE: 63950193.56852365
R2 Score: 0.975373189762595


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression classifier for experience level, with the solver's
# iteration cap set to 1000.
log_reg = LogisticRegression(max_iter=1000).fit(Xc_train, yc_train)

# Predicted classes for the held-out rows.
y_pred_log = log_reg.predict(Xc_test)

# Overall accuracy followed by the per-class precision/recall/F1 table.
log_reg_accuracy = accuracy_score(yc_test, y_pred_log)
log_reg_report = classification_report(yc_test, y_pred_log)

print("Logistic Regression Accuracy:", log_reg_accuracy)
print(log_reg_report)


Logistic Regression Accuracy: 0.97
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.95      0.97      0.96        40
           2       0.98      0.96      0.97        50

    accuracy                           0.97       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.97      0.97      0.97       100



In [16]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the classification features; the seed fixes the
# tree's random choices so the result is reproducible.
dt = DecisionTreeClassifier(random_state=42).fit(Xc_train, yc_train)

# Predicted classes for the held-out rows and the resulting accuracy.
y_pred_dt = dt.predict(Xc_test)
dt_accuracy = accuracy_score(yc_test, y_pred_dt)

print("Decision Tree Accuracy:", dt_accuracy)


Decision Tree Accuracy: 1.0


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees on the classification features, seeded for
# reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(Xc_train, yc_train)

# Predicted classes for the held-out rows and the resulting accuracy.
y_pred_rf = rf.predict(Xc_test)
rf_accuracy = accuracy_score(yc_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 1.0


In [18]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings, trained on the
# standardised classification features.
nb = GaussianNB().fit(Xc_train, yc_train)

# Predicted classes for the held-out rows and the resulting accuracy.
y_pred_nb = nb.predict(Xc_test)
nb_accuracy = accuracy_score(yc_test, y_pred_nb)

print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.92


In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows in
# the standardised feature space.
knn = KNeighborsClassifier(n_neighbors=5).fit(Xc_train, yc_train)

# Predicted classes for the held-out rows and the resulting accuracy.
y_pred_knn = knn.predict(Xc_test)
knn_accuracy = accuracy_score(yc_test, y_pred_knn)

print("KNN Accuracy:", knn_accuracy)


KNN Accuracy: 0.74


In [20]:
from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel, trained on the standardised
# classification features.
svm = SVC(kernel='rbf').fit(Xc_train, yc_train)

# Predicted classes for the held-out rows and the resulting accuracy.
y_pred_svm = svm.predict(Xc_test)
svm_accuracy = accuracy_score(yc_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)


SVM Accuracy: 0.92
