In [27]:
import datetime
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import pickle
import os

csv_file_name = '../Data/final_df.csv'
df = pd.read_csv(csv_file_name, dtype={'Company Size': str})#, skiprows=lambda x: x % 2)

hot_encode_columns = [
    'Lead Job Title',
    'Company Size',
    'Company Industry',
    'Company Li Company Type',
    'Company Location Country Name',
    'Email Status'
]

df_encoded = pd.DataFrame()

current_year = datetime.datetime.now().year
df_encoded['Years Since Company Founded'] = current_year - df['Company Founded In']

# Convert Lead Years and Months to total months for position
df_encoded['Total Months In Position'] = df['Lead Years In Position'] * 12 + df['Lead Months In Position']

# Convert Lead Years and Months to total months for company tenure
df_encoded['Total Months In Company'] = df['Lead Years In Company'] * 12 + df['Lead Months In Company']

for column in hot_encode_columns+['Relevant', 'Company Followers', 'Company Des Relevant Score']:
    df_encoded[column] = df[column]

# Create a label encoder object
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Encode each column in hot_encode_columns with label encoding
for column in hot_encode_columns:
    # Create a new ordinal encoder object for each column
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    
    # Extract the column as a DataFrame with a single column
    column_data = df_encoded[[column]]
    
    # Fit and transform the data using the encoder
    encoded_column = ordinal_encoder.fit_transform(column_data)
    
    # Save the ordinal encoder to a file in the 'encoders' directory
    encoder_filename = os.path.join('encoders', f'{column}_encoder.pkl')
    with open(encoder_filename, 'wb') as file:
        pickle.dump(ordinal_encoder, file)

    # Replace the column in df_encoded with the encoded values
    df_encoded[column] = encoded_column

df_encoded.fillna(0, inplace=True)

df_encoded['Relevant'] = df_encoded['Relevant'].replace(2, 1)

In [29]:
import datetime
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Relevant'])
y = df_encoded['Relevant']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using upsampling of minority class (1)
X_train_balanced, y_train_balanced = resample(X_train[y_train == 1],
                                              y_train[y_train == 1],
                                              replace=True,
                                              n_samples=sum(y_train == 0),
                                              random_state=39)

X_train_balanced = pd.concat([X_train[y_train == 0], X_train_balanced])
y_train_balanced = pd.concat([y_train[y_train == 0], y_train_balanced])

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np

X_train_balanced_numeric = X_train_balanced.apply(pd.to_numeric, errors='coerce')
X_train_balanced_numeric.dropna(inplace=True)
X_train_array = X_train_balanced_numeric.values.astype(np.float32)

X_train_tensor = torch.tensor(X_train_array)
y_train_tensor = torch.tensor(y_train_balanced.values, dtype=torch.int64)

X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')
X_test_numeric.fillna(0, inplace=True)
X_test_array = X_test_numeric.values.astype(np.float32)

X_test_tensor = torch.tensor(X_test_array)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.int64)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(11, 16)
        self.layer2 = nn.Linear(16, 16)
        self.layer3 = nn.Linear(16, 16)
        self.layer4 = nn.Linear(16, 16)
        self.layer5 = nn.Linear(16, 16)
        self.layer6 = nn.Linear(16, 16)
        self.layer7 = nn.Linear(16, 16)
        self.output_layer = nn.Linear(16, 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        x = self.relu(self.layer6(x))
        x = self.relu(self.layer7(x))
        x = self.softmax(self.output_layer(x))
        return x

model = NeuralNetwork()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0004)

for epoch in range(300):
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/300, Loss: {loss.item():.4f}')

with torch.no_grad():
    output = model(X_test_tensor)
    _, y_pred = torch.max(output, 1)

y_pred_numpy = y_pred.numpy()

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_numpy)
print("Confusion Matrix:\n", conf_matrix)

# Calculate classification report
class_report = classification_report(y_test, y_pred_numpy)
print("Classification Report:\n", class_report)

torch.save(model.state_dict(), 'neural_network_model.pth')

Epoch 1/300, Loss: 0.7144
Epoch 2/300, Loss: 0.7095
Epoch 3/300, Loss: 0.7042
Epoch 4/300, Loss: 0.6990
Epoch 5/300, Loss: 0.6940
Epoch 6/300, Loss: 0.6890
Epoch 7/300, Loss: 0.7174
Epoch 8/300, Loss: 0.6864
Epoch 9/300, Loss: 0.6868
Epoch 10/300, Loss: 0.6875
Epoch 11/300, Loss: 0.6882
Epoch 12/300, Loss: 0.6887
Epoch 13/300, Loss: 0.6890
Epoch 14/300, Loss: 0.6891
Epoch 15/300, Loss: 0.6890
Epoch 16/300, Loss: 0.6889
Epoch 17/300, Loss: 0.6886
Epoch 18/300, Loss: 0.6882
Epoch 19/300, Loss: 0.6877
Epoch 20/300, Loss: 0.6871
Epoch 21/300, Loss: 0.6864
Epoch 22/300, Loss: 0.6855
Epoch 23/300, Loss: 0.6846
Epoch 24/300, Loss: 0.6836
Epoch 25/300, Loss: 0.6825
Epoch 26/300, Loss: 0.6813
Epoch 27/300, Loss: 0.6801
Epoch 28/300, Loss: 0.6788
Epoch 29/300, Loss: 0.6775
Epoch 30/300, Loss: 0.6763
Epoch 31/300, Loss: 0.6755
Epoch 32/300, Loss: 0.6756
Epoch 33/300, Loss: 0.6754
Epoch 34/300, Loss: 0.6739
Epoch 35/300, Loss: 0.6729
Epoch 36/300, Loss: 0.6725
Epoch 37/300, Loss: 0.6722
Epoch 38/3

In [36]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

def load_and_infer(df):
    # Load saved model
    class NeuralNetwork(nn.Module):
        def __init__(self):
            super(NeuralNetwork, self).__init__()
            self.layer1 = nn.Linear(11, 16)
            self.layer2 = nn.Linear(16, 16)
            self.layer3 = nn.Linear(16, 16)
            self.layer4 = nn.Linear(16, 16)
            self.layer5 = nn.Linear(16, 16)
            self.layer6 = nn.Linear(16, 16)
            self.layer7 = nn.Linear(16, 16)
            self.output_layer = nn.Linear(16, 2)
            self.relu = nn.ReLU()
            self.softmax = nn.Softmax(dim=1)

        def forward(self, x):
            x = self.relu(self.layer1(x))
            x = self.relu(self.layer2(x))
            x = self.relu(self.layer3(x))
            x = self.relu(self.layer4(x))
            x = self.relu(self.layer5(x))
            x = self.relu(self.layer6(x))
            x = self.relu(self.layer7(x))
            x = self.softmax(self.output_layer(x))
            return x

    model = NeuralNetwork()
    model.load_state_dict(torch.load('neural_network_model.pth'))
    model.eval()

    # Preprocess dataframe
    df_encoded = pd.DataFrame()

    current_year = datetime.datetime.now().year
    df_encoded['Years Since Company Founded'] = current_year - df['Company Founded In']

    df_encoded['Total Months In Position'] = df['Lead Years In Position'] * 12 + df['Lead Months In Position']

    df_encoded['Total Months In Company'] = df['Lead Years In Company'] * 12 + df['Lead Months In Company']

    for column in hot_encode_columns+['Company Followers', 'Company Des Relevant Score']:
        df_encoded[column] = df[column]

    for column in hot_encode_columns:
        # Load the encoder
        encoder_filename = f'encoders/{column}_encoder.pkl'  # Assuming the encoders are saved in a folder named 'encoders'
        with open(encoder_filename, 'rb') as file:
            encoder = pickle.load(file)
        
        # Encode the column in df_encoded
        encoded_column = encoder.transform(df_encoded[[column]])
        
        # Replace the column in df_encoded with the encoded values
        df_encoded[column] = encoded_column

    df_encoded.fillna(0, inplace=True)

    print(df_encoded.shape)

    # Convert dataframe to tensor
    df_numeric = df_encoded.apply(pd.to_numeric, errors='coerce')
    df_numeric.dropna(inplace=True)
    df_array = df_numeric.values.astype(np.float32)
    df_tensor = torch.tensor(df_array)

    # Perform inference
    with torch.no_grad():
        output = model(df_tensor)
    
    # Convert probabilities to raw probabilities
    probabilities = output.numpy()

    # Create a dataframe with Company Name and Predictions
    company_names = df['Company Name'].iloc[df_numeric.index]
    predictions_df = pd.DataFrame({'Company Name': company_names, 'Predicted_Relevancy': probabilities[:, 1]})

    # Sort by Predicted_Relevancy
    predictions_df = predictions_df.sort_values(by='Predicted_Relevancy', ascending=False)

    predictions_df = predictions_df.reset_index(drop=True)
    
    return predictions_df

# Example usage:
# Assuming df is the dataframe you want to perform inference on
csv_file_name = '../Data/final_df.csv'
df = pd.read_csv(csv_file_name, dtype={'Company Size': str})

predictions_df = load_and_infer(df)
predictions_df = predictions_df.dropna(subset=['Company Name'])
print(predictions_df)

(2613, 11)
            Company Name  Predicted_Relevancy
0                Evernow             0.996795
1                  Round             0.995816
2                Portpro             0.995475
3               Accusoft             0.995378
4     Terlato Wine Group             0.995053
...                  ...                  ...
2608      Morgan Stanley             0.000000
2609          Blackstone             0.000000
2610     Stealth Startup             0.000000
2611           Accenture             0.000000
2612                 Ubs             0.000000

[2608 rows x 2 columns]
