In [4]:
import datetime
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

csv_file_name = '../Data/final_df.csv'
df = pd.read_csv(csv_file_name, dtype={'Company Size': str})#, skiprows=lambda x: x % 2)

hot_encode_columns = [
    'Lead Job Title',
    'Company Size',
    'Company Industry',
    'Company Li Company Type',
    'Company Location Country Name',
    'Email Status'
]

df_encoded = pd.DataFrame()

current_year = datetime.datetime.now().year
df_encoded['Years Since Company Founded'] = current_year - df['Company Founded In']

# Convert Lead Years and Months to total months for position
df_encoded['Total Months In Position'] = df['Lead Years In Position'] * 12 + df['Lead Months In Position']

# Convert Lead Years and Months to total months for company tenure
df_encoded['Total Months In Company'] = df['Lead Years In Company'] * 12 + df['Lead Months In Company']

for column in hot_encode_columns+['Relevant', 'Company Followers', 'Company Des Relevant Score']:
    df_encoded[column] = df[column]

df_encoded = pd.get_dummies(df_encoded, columns=hot_encode_columns)

df_encoded.fillna(0, inplace=True)

df_encoded['Relevant'] = df_encoded['Relevant'].replace(2, 1)

In [5]:
import datetime
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Relevant'])
y = df_encoded['Relevant']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using upsampling of minority class (1)
X_train_balanced, y_train_balanced = resample(X_train[y_train == 1],
                                              y_train[y_train == 1],
                                              replace=True,
                                              n_samples=sum(y_train == 0),
                                              random_state=39)

X_train_balanced = pd.concat([X_train[y_train == 0], X_train_balanced])
y_train_balanced = pd.concat([y_train[y_train == 0], y_train_balanced])


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np

X_train_balanced_numeric = X_train_balanced.apply(pd.to_numeric, errors='coerce')
X_train_balanced_numeric.dropna(inplace=True)
X_train_array = X_train_balanced_numeric.values.astype(np.float32)

X_train_tensor = torch.tensor(X_train_array)
y_train_tensor = torch.tensor(y_train_balanced.values, dtype=torch.int64)

X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')
X_test_numeric.fillna(0, inplace=True)
X_test_array = X_test_numeric.values.astype(np.float32)

X_test_tensor = torch.tensor(X_test_array)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.int64)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(X_train_balanced.shape[1], 16)
        self.layer2 = nn.Linear(16, 16)
        self.layer3 = nn.Linear(16, 16)
        self.layer4 = nn.Linear(16, 16)
        self.layer5 = nn.Linear(16, 16)
        self.layer6 = nn.Linear(16, 16)
        self.layer7 = nn.Linear(16, 16)
        self.output_layer = nn.Linear(16, 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        x = self.relu(self.layer6(x))
        x = self.relu(self.layer7(x))
        x = self.softmax(self.output_layer(x))
        return x

model = NeuralNetwork()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

for epoch in range(300):
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/300, Loss: {loss.item():.4f}')

with torch.no_grad():
    output = model(X_test_tensor)
    _, y_pred = torch.max(output, 1)

y_pred_numpy = y_pred.numpy()

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_numpy)
print("Confusion Matrix:\n", conf_matrix)

# Calculate classification report
class_report = classification_report(y_test, y_pred_numpy)
print("Classification Report:\n", class_report)

torch.save(model.state_dict(), 'neural_network_model.pth')

Epoch 1/300, Loss: 0.6834
Epoch 2/300, Loss: 0.6824
Epoch 3/300, Loss: 0.6816
Epoch 4/300, Loss: 0.6810
Epoch 5/300, Loss: 0.6805
Epoch 6/300, Loss: 0.6802
Epoch 7/300, Loss: 0.6801
Epoch 8/300, Loss: 0.6798
Epoch 9/300, Loss: 0.6794
Epoch 10/300, Loss: 0.6791
Epoch 11/300, Loss: 0.6789
Epoch 12/300, Loss: 0.6786
Epoch 13/300, Loss: 0.6783
Epoch 14/300, Loss: 0.6780
Epoch 15/300, Loss: 0.6778
Epoch 16/300, Loss: 0.6776
Epoch 17/300, Loss: 0.6775
Epoch 18/300, Loss: 0.6773
Epoch 19/300, Loss: 0.6772
Epoch 20/300, Loss: 0.6770
Epoch 21/300, Loss: 0.6769
Epoch 22/300, Loss: 0.6767
Epoch 23/300, Loss: 0.6766
Epoch 24/300, Loss: 0.6764
Epoch 25/300, Loss: 0.6763
Epoch 26/300, Loss: 0.6762
Epoch 27/300, Loss: 0.6760
Epoch 28/300, Loss: 0.6759
Epoch 29/300, Loss: 0.6757
Epoch 30/300, Loss: 0.6756
Epoch 31/300, Loss: 0.6754
Epoch 32/300, Loss: 0.6752
Epoch 33/300, Loss: 0.6751
Epoch 34/300, Loss: 0.6749
Epoch 35/300, Loss: 0.6747
Epoch 36/300, Loss: 0.6746
Epoch 37/300, Loss: 0.6744
Epoch 38/3

In [14]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

def load_and_infer(df):
    # Load saved model
    class NeuralNetwork(nn.Module):
        def __init__(self):
            super(NeuralNetwork, self).__init__()
            self.layer1 = nn.Linear(X_train_balanced.shape[1], 16)
            self.layer2 = nn.Linear(16, 16)
            self.layer3 = nn.Linear(16, 16)
            self.layer4 = nn.Linear(16, 16)
            self.layer5 = nn.Linear(16, 16)
            self.layer6 = nn.Linear(16, 16)
            self.layer7 = nn.Linear(16, 16)
            self.output_layer = nn.Linear(16, 2)
            self.relu = nn.ReLU()
            self.softmax = nn.Softmax(dim=1)

        def forward(self, x):
            x = self.relu(self.layer1(x))
            x = self.relu(self.layer2(x))
            x = self.relu(self.layer3(x))
            x = self.relu(self.layer4(x))
            x = self.relu(self.layer5(x))
            x = self.relu(self.layer6(x))
            x = self.relu(self.layer7(x))
            x = self.softmax(self.output_layer(x))
            return x

    model = NeuralNetwork()
    model.load_state_dict(torch.load('neural_network_model.pth'))
    model.eval()

    # Preprocess dataframe
    df_encoded = pd.DataFrame()

    current_year = datetime.datetime.now().year
    df_encoded['Years Since Company Founded'] = current_year - df['Company Founded In']

    df_encoded['Total Months In Position'] = df['Lead Years In Position'] * 12 + df['Lead Months In Position']

    df_encoded['Total Months In Company'] = df['Lead Years In Company'] * 12 + df['Lead Months In Company']

    for column in hot_encode_columns+['Company Followers', 'Company Des Relevant Score']:
        df_encoded[column] = df[column]

    df_encoded = pd.get_dummies(df_encoded, columns=hot_encode_columns)

    df_encoded.fillna(0, inplace=True)

    # Convert dataframe to tensor
    df_numeric = df_encoded.apply(pd.to_numeric, errors='coerce')
    df_numeric.dropna(inplace=True)
    df_array = df_numeric.values.astype(np.float32)
    df_tensor = torch.tensor(df_array)

    # Perform inference
    with torch.no_grad():
        output = model(df_tensor)
    
    # Convert probabilities to raw probabilities
    probabilities = output.numpy()

    # Create a dataframe with Company Name and Predictions
    company_names = df['Company Name'].iloc[df_numeric.index]
    predictions_df = pd.DataFrame({'Company Name': company_names, 'Predicted_Relevancy': probabilities[:, 1]})

    # Sort by Predicted_Relevancy
    predictions_df = predictions_df.sort_values(by='Predicted_Relevancy', ascending=False)

    predictions_df = predictions_df.reset_index(drop=True)
    
    return predictions_df

# Example usage:
# Assuming df is the dataframe you want to perform inference on
csv_file_name = '../Data/final_df.csv'
df = pd.read_csv(csv_file_name, dtype={'Company Size': str})

predictions_df = load_and_infer(df)
predictions_df = predictions_df.dropna(subset=['Company Name'])
print(predictions_df)

               Company Name  Predicted_Relevancy
0         The Collaborative             0.794061
1                  Accusoft             0.792764
2          Connect Staffing             0.782988
3     Alti Tiedemann Global             0.778712
4                  Petrotel             0.778418
...                     ...                  ...
2608                Netflix             0.000000
2609              Accenture             0.000000
2610              Accenture             0.000000
2611              Accenture             0.000000
2612              Accenture             0.000000

[2608 rows x 2 columns]
