In [885]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [886]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df_solution = pd.read_csv('../data/solution_format.csv')

In [887]:
df_train.head()

Unnamed: 0,obs,job_title,job_posted_date,salary_category,job_state,feature_1,feature_2,feature_3,feature_4,feature_5,...,job_desc_291,job_desc_292,job_desc_293,job_desc_294,job_desc_295,job_desc_296,job_desc_297,job_desc_298,job_desc_299,job_desc_300
0,1,Others,2024/07,High,NY,A,0.6429,False,False,True,...,-0.362079,-0.499308,-0.367894,-0.214881,0.01487,-0.271177,-0.113347,-0.587955,-0.919095,-0.20734
1,2,Job_Title_1,2024/07,Low,CA,A,0.4678,False,False,False,...,-0.300989,-0.415411,-0.341824,-0.319064,0.042322,-0.124755,0.023489,-0.893224,-0.823024,0.112364
2,3,Others,2024/07,Low,CA,A,0.461,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Others,2024/07,Low,CA,A,0.5064,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Others,2024/07,Low,CA,A,0.464,False,False,False,...,-0.406159,-0.654657,-0.074398,-0.464479,0.081037,-0.136992,-0.27627,-0.696853,-0.601466,0.089939


In [888]:
df_test.head()

Unnamed: 0,obs,job_title,job_posted_date,job_state,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,job_desc_291,job_desc_292,job_desc_293,job_desc_294,job_desc_295,job_desc_296,job_desc_297,job_desc_298,job_desc_299,job_desc_300
0,1281,Others,2024/06,CA,A,0.6473,False,False,True,True,...,-0.054078,-0.573635,-0.306883,-0.325092,0.089463,-0.353476,-0.159314,-0.667958,-0.702116,-0.206267
1,1282,Others,2024/08,NY,A,0.4238,True,False,False,False,...,-0.868718,-0.337967,-0.179036,-0.717763,0.404843,0.032468,-0.190448,-1.261702,-0.505897,0.08208
2,1283,Others,2023/01,CA,A,0.6219,True,False,False,True,...,-0.416109,-0.619822,-0.493653,-0.347556,0.071679,-0.331212,-0.381348,-0.50654,-0.773561,-0.105221
3,1284,Job_Title_5,2024/06,NY,A,0.6704,False,False,False,True,...,-0.29756,-0.481448,-0.497642,-0.254823,0.047404,-0.362739,-0.102704,-0.491272,-0.808156,-0.048326
4,1285,Others,2024/05,CA,A,0.731,False,False,False,True,...,-0.176458,-0.726473,-0.323976,-0.145825,-0.046866,-0.229873,-0.568318,-0.614605,-0.770506,0.14214


In [889]:
df_train = df_train.convert_dtypes()
df_test = df_test.convert_dtypes()

In [890]:
# handle missing values with simple imputation
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280 entries, 0 to 1279
Columns: 317 entries, obs to job_desc_300
dtypes: Float64(301), Int64(2), boolean(9), string(5)
memory usage: 3.4 MB


In [891]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854 entries, 0 to 853
Columns: 316 entries, obs to job_desc_300
dtypes: Float64(301), Int64(2), boolean(9), string(4)
memory usage: 2.3 MB


In [892]:
len(df_train.columns), len(df_test.columns)

(317, 316)

In [893]:
df_train.isnull().sum().sum()

np.int64(474)

In [894]:
df_test.isnull().sum().sum()

np.int64(327)

In [895]:
# detect columns with missing values
df_train_missing_cols= df_train.columns[df_train.isnull().any()].tolist()
df_test_missing_cols = df_test.columns[df_test.isnull().any()].tolist()

In [896]:
df_train_missing_cols

['job_posted_date', 'job_state', 'feature_10']

In [897]:
df_test_missing_cols

['job_state', 'feature_10']

In [898]:
# convert job_posted_date to datetime
df_train['job_posted_date'] = pd.to_datetime(df_train['job_posted_date'], errors='coerce')
df_test['job_posted_date'] = pd.to_datetime(df_test['job_posted_date'], errors='coerce')

  df_train['job_posted_date'] = pd.to_datetime(df_train['job_posted_date'], errors='coerce')
  df_test['job_posted_date'] = pd.to_datetime(df_test['job_posted_date'], errors='coerce')


In [899]:
# convert to linux time df_train['job_posted_date']
df_train['job_posted_date'] = df_train['job_posted_date'].astype(np.int64) // 10**9
df_test['job_posted_date'] = df_test['job_posted_date'].astype(np.int64) // 10**9

In [900]:
df_train['job_posted_date']

0       1719792000
1       1719792000
2       1719792000
3       1719792000
4       1719792000
           ...    
1275    1717200000
1276    1719792000
1277    1722470400
1278    1719792000
1279    1719792000
Name: job_posted_date, Length: 1280, dtype: int64

In [901]:
mean = df_train['job_posted_date'].mean()
# fill missing values with mean for df_train['job_posted_date']
df_train['job_posted_date'].fillna(mean, inplace=True)
# fill missing values with mean for df_test['job_posted_date']
df_test['job_posted_date'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['job_posted_date'].fillna(mean, inplace=True)


In [902]:
mode = df_train['job_state'].mode()[0]
# fill missing values with mode for job_state
df_train['job_state'].fillna(mode, inplace=True)
# fill missing values with mode for job_state
df_test['job_state'].fillna(mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['job_state'].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['job_state'].fillna(mode, inplace=True)


In [903]:
df_train['job_state']

0       NY
1       CA
2       CA
3       CA
4       CA
        ..
1275    CA
1276    CA
1277    NY
1278    CA
1279    NY
Name: job_state, Length: 1280, dtype: string

In [904]:
df_test['job_state']

0      CA
1      NY
2      CA
3      NY
4      CA
       ..
849    CA
850    CA
851    IL
852    DC
853    CA
Name: job_state, Length: 854, dtype: string

In [905]:
df_train['feature_10']

0         60
1         60
2         36
3         72
4         48
        ... 
1275    <NA>
1276    <NA>
1277    <NA>
1278    <NA>
1279      36
Name: feature_10, Length: 1280, dtype: Int64

In [906]:
mean = int(df_train['feature_10'].mean())
# fill missing values with mean for feature_10
df_train['feature_10'].fillna(mean, inplace=True)
# fill missing values with mean for feature_10
df_test['feature_10'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['feature_10'].fillna(mean, inplace=True)


In [907]:
df_train.isnull().sum().sum()

np.int64(0)

In [908]:
df_test.isnull().sum().sum()

np.int64(0)

In [909]:
df_train.duplicated().sum()


np.int64(0)

In [910]:
df_test.duplicated().sum()

np.int64(0)

In [912]:
df_train_string = df_train.select_dtypes(include=['string']).drop(columns=['salary_category'])
df_test_string = df_test.select_dtypes(include=['string'])
# identify all unique values in df_train_string and df_test_string
unique_train_values = {col: df_train_string[col].unique() for col in df_train_string.columns}
unique_test_values = {col: df_test_string[col].unique() for col in df_test_string.columns}

In [913]:
unique_train_values

{'job_title': <StringArray>
 [      'Others',  'Job_Title_1',  'Job_Title_2',  'Job_Title_3',
   'Job_Title_4',  'Job_Title_5',  'Job_Title_6',  'Job_Title_7',
   'Job_Title_8',  'Job_Title_9', 'Job_Title_10', 'Job_Title_11',
  'Job_Title_12', 'Job_Title_13', 'Job_Title_14', 'Job_Title_15',
  'Job_Title_16', 'Job_Title_17', 'Job_Title_18', 'Job_Title_19',
  'Job_Title_20', 'Job_Title_21', 'Job_Title_22', 'Job_Title_23',
  'Job_Title_24', 'Job_Title_25', 'Job_Title_26', 'Job_Title_27']
 Length: 28, dtype: string,
 'job_state': <StringArray>
 ['NY', 'CA', 'WA', 'NC', 'KY', 'DC', 'NJ', 'TX', 'FL', 'MA', 'VA', 'AZ', 'GA',
  'IL', 'IN', 'MD', 'TN', 'CO', 'CT', 'SD', 'MI', 'MN', 'PA', 'AK', 'UT', 'OH',
  'SC', 'AR', 'OR', 'OK', 'LA', 'NM', 'NV', 'IA', 'MO', 'AL']
 Length: 36, dtype: string,
 'feature_1': <StringArray>
 ['A', 'B', 'C', 'E', 'D']
 Length: 5, dtype: string}

In [914]:
unique_test_values

{'job_title': <StringArray>
 [      'Others',  'Job_Title_5', 'Job_Title_10', 'Job_Title_13',
  'Job_Title_27',  'Job_Title_7',  'Job_Title_8',  'Job_Title_2',
   'Job_Title_3',  'Job_Title_9',  'Job_Title_1', 'Job_Title_19',
  'Job_Title_23',  'Job_Title_6', 'Job_Title_15', 'Job_Title_22',
  'Job_Title_24', 'Job_Title_12', 'Job_Title_11', 'Job_Title_21',
  'Job_Title_17', 'Job_Title_14', 'Job_Title_25', 'Job_Title_20',
  'Job_Title_18', 'Job_Title_16',  'Job_Title_4', 'Job_Title_26']
 Length: 28, dtype: string,
 'job_state': <StringArray>
 ['CA', 'NY', 'VA', 'WA', 'MA', 'UT', 'IN', 'IL', 'TX', 'PA', 'IA', 'KY', 'OH',
  'CO', 'NJ', 'DC', 'AL', 'CT', 'MD', 'OR', 'GA', 'NV', 'MO', 'AZ', 'NC', 'MN',
  'MI', 'WY', 'AR', 'TN', 'SC', 'KS', 'RI', 'FL']
 Length: 34, dtype: string,
 'feature_1': <StringArray>
 ['A', 'B', 'C', 'D', 'E']
 Length: 5, dtype: string}

In [915]:
# merge unique values in unique_test_values and unique_train_values
unique_values = {col: np.union1d(unique_train_values[col], unique_test_values[col]) for col in unique_train_values.keys()}

In [916]:
unique_values

{'job_title': array(['Job_Title_1', 'Job_Title_10', 'Job_Title_11', 'Job_Title_12',
        'Job_Title_13', 'Job_Title_14', 'Job_Title_15', 'Job_Title_16',
        'Job_Title_17', 'Job_Title_18', 'Job_Title_19', 'Job_Title_2',
        'Job_Title_20', 'Job_Title_21', 'Job_Title_22', 'Job_Title_23',
        'Job_Title_24', 'Job_Title_25', 'Job_Title_26', 'Job_Title_27',
        'Job_Title_3', 'Job_Title_4', 'Job_Title_5', 'Job_Title_6',
        'Job_Title_7', 'Job_Title_8', 'Job_Title_9', 'Others'],
       dtype=object),
 'job_state': array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'FL', 'GA', 'IA',
        'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO', 'NC',
        'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD',
        'TN', 'TX', 'UT', 'VA', 'WA', 'WY'], dtype=object),
 'feature_1': array(['A', 'B', 'C', 'D', 'E'], dtype=object)}

In [None]:
# use unique_values for one hot encoding df_train_string and df_test_string
def one_hot_encode(df, unique_values):
    for col in df.columns:
        for value in unique_values[col]:
            df[f'{col}_{value}'] = (df[col] == value).astype(int)
    return df.drop(columns=df.columns[df.isnull().any()])

df_train_encoded = one_hot_encode(df_train_string, unique_values)
df_test_encoded = one_hot_encode(df_test_string, unique_values)

In [918]:
df_train_encoded.drop(columns=['job_title','job_state','feature_1'], inplace=True)

In [919]:
df_test_encoded.drop(columns=['job_title','job_state','feature_1'], inplace=True)

In [920]:
df_train_encoded

Unnamed: 0,job_title_Job_Title_1,job_title_Job_Title_10,job_title_Job_Title_11,job_title_Job_Title_12,job_title_Job_Title_13,job_title_Job_Title_14,job_title_Job_Title_15,job_title_Job_Title_16,job_title_Job_Title_17,job_title_Job_Title_18,...,job_state_TX,job_state_UT,job_state_VA,job_state_WA,job_state_WY,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1276,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1277,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [921]:
df_test_encoded

Unnamed: 0,job_title_Job_Title_1,job_title_Job_Title_10,job_title_Job_Title_11,job_title_Job_Title_12,job_title_Job_Title_13,job_title_Job_Title_14,job_title_Job_Title_15,job_title_Job_Title_16,job_title_Job_Title_17,job_title_Job_Title_18,...,job_state_TX,job_state_UT,job_state_VA,job_state_WA,job_state_WY,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
df_num = df_train.select_dtypes(include=['int64', 'float64'])
df_bool = df_train.select_dtypes(include=['bool'])
df_string = df_train.select_dtypes(include=['string'])
#drop column salary_category from df_string
df_string = df_string.drop(columns=['salary_category'])
df_string.head()

Unnamed: 0,job_title,job_state,feature_1
0,Others,NY,A
1,Job_Title_1,CA,A
2,Others,CA,A
3,Others,CA,A
4,Others,CA,A


In [None]:
# convert catoregical columns to one-hot encoding
df_cat = pd.get_dummies(df_string, drop_first=True)

In [924]:
y = pd.get_dummies(df_train['salary_category'])
y = y.astype(int)
y

Unnamed: 0,High,Low,Medium
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
1275,1,0,0
1276,1,0,0
1277,0,0,1
1278,0,0,1


In [927]:
#concatenate numerical, categorical and boolean columns
df_train = pd.concat([df_num, df_bool, df_train_encoded, y], axis=1)

In [None]:
df_num = df_test.select_dtypes(include=['int64', 'float64'])
df_bool = df_test.select_dtypes(include=['bool'])

In [932]:
#concatenate numerical, categorical and boolean columns
df_test = pd.concat([df_num, df_bool, df_test_encoded], axis=1)

In [934]:
X = df_train.drop(columns=['High', 'Low', 'Medium'])
y = df_train[['High', 'Low', 'Medium']]

In [935]:
# split train valid
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=42)

In [936]:
# normlaize numerical columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])
X_test = df_test.copy()
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [937]:
X_train = X_train.astype(np.float32)
X_valid = X_valid.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_valid = y_valid.astype(np.float32)

In [938]:
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape

((1024, 385), (256, 385), (854, 385), (1024, 3), (256, 3))

In [None]:
# train neural network with pytorch with early stopping with train and valid split 
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3) 

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
def train_model(model, X_train, y_train, X_valid, y_valid, num_epochs=100, batch_size=32, patience=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32).to(device)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).to(device)

    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_outputs = model(X_valid_tensor)
            val_loss = criterion(val_outputs, y_valid_tensor)

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print("Early stopping triggered")
            break

    model.load_state_dict(best_model_state)
    return model
# Initialize and train the model
input_dim = X_train.shape[1]
model = SimpleNN(input_dim)
trained_model = train_model(model, X_train, y_train, X_valid, y_valid, num_epochs=100, batch_size=32, patience=10)
# Evaluate the model for test set
def evaluate_model(model, X_test):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
        outputs = model(X_test_tensor)
        probabilities = torch.sigmoid(outputs).cpu().numpy()
    
    return probabilities


Epoch 1/100, Loss: 0.7154, Val Loss: 0.6983
Epoch 2/100, Loss: 0.6963, Val Loss: 0.6839
Epoch 3/100, Loss: 0.6799, Val Loss: 0.6702
Epoch 4/100, Loss: 0.6643, Val Loss: 0.6567
Epoch 5/100, Loss: 0.6490, Val Loss: 0.6434
Epoch 6/100, Loss: 0.6340, Val Loss: 0.6307
Epoch 7/100, Loss: 0.6197, Val Loss: 0.6190
Epoch 8/100, Loss: 0.6064, Val Loss: 0.6085
Epoch 9/100, Loss: 0.5945, Val Loss: 0.5995
Epoch 10/100, Loss: 0.5840, Val Loss: 0.5921
Epoch 11/100, Loss: 0.5748, Val Loss: 0.5861
Epoch 12/100, Loss: 0.5668, Val Loss: 0.5813
Epoch 13/100, Loss: 0.5599, Val Loss: 0.5775
Epoch 14/100, Loss: 0.5538, Val Loss: 0.5743
Epoch 15/100, Loss: 0.5481, Val Loss: 0.5713
Epoch 16/100, Loss: 0.5424, Val Loss: 0.5679
Epoch 17/100, Loss: 0.5363, Val Loss: 0.5641
Epoch 18/100, Loss: 0.5298, Val Loss: 0.5599
Epoch 19/100, Loss: 0.5227, Val Loss: 0.5553
Epoch 20/100, Loss: 0.5154, Val Loss: 0.5506
Epoch 21/100, Loss: 0.5078, Val Loss: 0.5458
Epoch 22/100, Loss: 0.5002, Val Loss: 0.5413
Epoch 23/100, Loss:

In [940]:
# evalute the mode on thest set
test_probabilities = evaluate_model(trained_model, X_test)

In [942]:
test_probabilities_df = pd.DataFrame(test_probabilities, columns=['High', 'Low', 'Medium'])
# change to one column with text High, Low, Medium and start with index obs 1281
test_probabilities_df = test_probabilities_df.idxmax(axis=1).reset_index(drop=True)
# add column obs starting with 1281 and salary_category
test_probabilities_df = pd.DataFrame({
    'obs': range(1281, 1281 + len(test_probabilities_df)),
    'salary_category': test_probabilities_df
})

In [944]:
# replace column obs with index starting from 1281
test_probabilities_df['obs'] = test_probabilities_df.index + 1281

In [946]:
# drop index column and set obs as index
test_probabilities_df.set_index('obs', inplace=True)

In [947]:
test_probabilities_df

Unnamed: 0_level_0,salary_category
obs,Unnamed: 1_level_1
1281,High
1282,Medium
1283,High
1284,Low
1285,High
...,...
2130,Low
2131,Medium
2132,Medium
2133,Low


In [948]:
# save solution to csv
test_probabilities_df.to_csv('../code/solution.csv', index=True, header=True)