# Importing Libraries

In [23]:
import pandas as pd
import copy
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Loading train data

In [24]:
train_data = pd.read_excel("Train Data.xlsx")

# Data Inspection

In [25]:
train_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


In [26]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4894 entries, 0 to 4893
Data columns (total 23 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   First Name                                                       4894 non-null   object 
 1   Email ID                                                         4894 non-null   object 
 2   Quantity                                                         4894 non-null   int64  
 3   Price Tier                                                       0 non-null      float64
 4   Ticket Type                                                      4894 non-null   object 
 5   Attendee #                                                       4490 non-null   float64
 6   Group                                                            0 non-null      float64
 7   Order Type                                

# Identifing and locating missing values

In [27]:
missing_data = train_data.isnull().sum()

In [28]:
missing_data

First Name                                                            0
Email ID                                                              0
Quantity                                                              0
Price Tier                                                         4894
Ticket Type                                                           0
Attendee #                                                          404
Group                                                              4894
Order Type                                                            0
Currency                                                            404
Total Paid                                                            0
Fees Paid                                                           404
Eventbrite Fees                                                       0
Eventbrite Payment Processing                                         0
Attendee Status                                                 

# Cleaning the data using emails

In [29]:
train_data = train_data.drop_duplicates(subset=["Email ID"])

In [30]:
train_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


# Creating deep copies of your dataframe

In [31]:
train_data_backup = copy.deepcopy(train_data)

In [32]:
train_data_backup.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


# Selecting only useful columns

In [33]:
useful_columns = ["Ticket Type", "Attendee #", "Attendee Status", "College Name",
                  "Designation", "Year of Graduation", "CGPA", "Speaking Skills", 
                  "ML Knowledge", "Placement Status"]

train_data = train_data[useful_columns]

In [34]:
train_data.head()

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Art of Resume Building,2213855000.0,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Students,,6.7,2,5,Placed
1,Art of Resume Building,2213859000.0,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Students,,8.2,3,2,Not placed
2,Art of Resume Building,2213862000.0,Attending,Don Bosco College of Engineering Fatorda Goa,Students,,6.5,4,3,Not placed
3,Art of Resume Building,2213988000.0,Attending,Pillai College of Engineering New Panvel,Students,,8.7,2,5,Not placed
4,Art of Resume Building,2214567000.0,Attending,St Xavier's College,Students,,9.1,3,5,Placed


# Imputation for numerical columns

In [35]:
numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_columns] = numerical_imputer.fit_transform(train_data[numerical_columns])

# Data Scaling and Transformation

In [36]:
scaler = StandardScaler()
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])

# Converting categorical columns to numeric using Label Encoding

In [37]:
categorical_columns = ["Ticket Type", "Attendee Status",
                       "College Name", "Designation", "Year of Graduation",
                       "Placement Status"]

for column in categorical_columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])

In [38]:
train_data[column]

0       1
1       0
2       0
3       0
4       1
       ..
4829    2
4831    2
4832    2
4834    2
4836    2
Name: Placement Status, Length: 1987, dtype: int32

In [39]:
train_data

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,0,-0.376180,0,5,32,17,-1.320958,-1.160881,1.326843,1
1,0,-0.376175,0,2,32,17,0.172014,-0.330247,-1.175312,0
2,0,-0.376171,0,7,32,17,-1.520021,0.500387,-0.341260,0
3,0,-0.376003,0,16,32,17,0.669671,-1.160881,1.326843,0
4,0,-0.375231,0,20,32,17,1.067796,-0.330247,1.326843,1
...,...,...,...,...,...,...,...,...,...,...
4829,15,-0.352786,0,44,32,17,1.067796,1.331021,0.492791,2
4831,15,-0.352744,0,52,32,17,-0.624238,-0.330247,-1.175312,2
4832,15,-0.352733,0,36,32,17,-0.723769,0.500387,0.492791,2
4834,15,-0.352678,0,53,32,17,1.167328,0.500387,-1.175312,2


# Separating features (x_train) and target (y_train)

In [40]:
x = train_data.drop("Placement Status", axis=1)
y = train_data["Placement Status"]

In [41]:
x

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,0,-0.376180,0,5,32,17,-1.320958,-1.160881,1.326843
1,0,-0.376175,0,2,32,17,0.172014,-0.330247,-1.175312
2,0,-0.376171,0,7,32,17,-1.520021,0.500387,-0.341260
3,0,-0.376003,0,16,32,17,0.669671,-1.160881,1.326843
4,0,-0.375231,0,20,32,17,1.067796,-0.330247,1.326843
...,...,...,...,...,...,...,...,...,...
4829,15,-0.352786,0,44,32,17,1.067796,1.331021,0.492791
4831,15,-0.352744,0,52,32,17,-0.624238,-0.330247,-1.175312
4832,15,-0.352733,0,36,32,17,-0.723769,0.500387,0.492791
4834,15,-0.352678,0,53,32,17,1.167328,0.500387,-1.175312


In [42]:
y

0       1
1       0
2       0
3       0
4       1
       ..
4829    2
4831    2
4832    2
4834    2
4836    2
Name: Placement Status, Length: 1987, dtype: int32

# Spliting the data into train and validation sets

In [43]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Model that can handle missing values natively

In [45]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

# Loading test data

In [46]:
test_data = pd.read_excel("Test Data.xlsx")

# Data Inspection

In [47]:
test_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


In [48]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3796 entries, 0 to 3795
Data columns (total 23 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   First Name                                                       3796 non-null   object 
 1   Email ID                                                         3796 non-null   object 
 2   Quantity                                                         3796 non-null   int64  
 3   Price Tier                                                       0 non-null      float64
 4   Ticket Type                                                      3796 non-null   object 
 5   Attendee #                                                       3794 non-null   float64
 6   Group                                                            0 non-null      float64
 7   Order Type                                

# Identifing and locating missing values

In [49]:
missing_data = test_data.isnull().sum()

In [50]:
missing_data

First Name                                                            0
Email ID                                                              0
Quantity                                                              0
Price Tier                                                         3796
Ticket Type                                                           0
Attendee #                                                            2
Group                                                              3796
Order Type                                                            0
Currency                                                              2
Total Paid                                                            0
Fees Paid                                                             2
Eventbrite Fees                                                       0
Eventbrite Payment Processing                                         0
Attendee Status                                                 

# Cleaning the data using emails

In [51]:
test_data = test_data.drop_duplicates(subset=["Email ID"])

In [52]:
test_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


# Creating deep copies of your dataframe

In [53]:
test_data_backup = copy.deepcopy(test_data)

In [54]:
test_data_backup.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,"Specify in ""Others"" (how did you come to know about this event)",Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


# Selecting only useful columns

In [55]:
useful_columns = ["Ticket Type", "Attendee #", "Attendee Status", "College Name",
                  "Designation", "Year of Graduation", "CGPA", "Speaking Skills", "ML Knowledge"]

test_data = test_data[useful_columns]

In [56]:
test_data.head()

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,Hello ML and DL,2293940000.0,Attending,"symbiosis institute of technology, pune",Students,,7.8,3,3
1,Hello ML and DL,2293941000.0,Attending,"mit academy of engineering ,alandi",Students,,9.1,3,3
2,Hello ML and DL,2293941000.0,Attending,a. c. patil college of engineering,Students,,6.9,2,2
3,Hello ML and DL,2293946000.0,Attending,wilson college,Students,,8.4,4,4
4,Hello ML and DL,2293956000.0,Attending,"ld college of engineering, ahmedabad, gujarat",Students,,6.7,5,5


# Imputation for numerical columns

In [57]:
numerical_columns = test_data.select_dtypes(include=['int64', 'float64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
test_data[numerical_columns] = numerical_imputer.fit_transform(test_data[numerical_columns])

# Data Scaling and Transformation

In [58]:
scaler = StandardScaler()
test_data[numerical_columns] = scaler.fit_transform(test_data[numerical_columns])

# Converting categorical columns to numeric using Label Encoding

In [59]:
categorical_columns = ["Ticket Type", "Attendee Status",
                       "College Name", "Designation", "Year of Graduation"
                       ]

for column in categorical_columns:
    le = LabelEncoder()
    test_data[column] = le.fit_transform(test_data[column])

In [60]:
test_data[column]

0       22
1       22
2       22
3       22
4       22
        ..
3685    22
3687    22
3692    22
3693    22
3695    22
Name: Year of Graduation, Length: 2321, dtype: int32

In [61]:
test_data

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,0,-0.269321,0,49,46,22,-0.233957,-0.254328,-0.240896
1,0,-0.269321,0,41,46,22,1.071365,-0.254328,-0.240896
2,0,-0.269320,0,27,46,22,-1.137641,-1.076466,-1.061922
3,0,-0.269316,0,53,46,22,0.368499,0.567810,0.580131
4,0,-0.269304,0,39,46,22,-1.338460,1.389948,1.401157
...,...,...,...,...,...,...,...,...,...
3685,11,-0.339259,0,34,46,22,-0.033138,-0.254328,-0.240896
3687,11,-0.338720,0,47,46,22,0.468909,1.389948,1.401157
3692,11,-0.338606,0,33,46,22,0.770137,-0.254328,-0.240896
3693,11,-0.338598,0,39,46,22,1.272184,0.567810,0.580131


# Model Evaluation

In [62]:
y_pred = model.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8467336683417085


# Calculate precision and recall

In [63]:
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
print("Validation Precision:", precision)
print("Validation Recall:", recall)

Validation Precision: 0.8222535602915765
Validation Recall: 0.8467336683417085


# Making predictions on test data

In [64]:
predictions = model.predict(test_data)

In [65]:
predictions

array([0, 0, 0, ..., 2, 2, 2])

In [66]:
y_true_test = test_data_backup["Placement Status"]

In [67]:
test_data_backup["Placement Status"].fillna(0, inplace=True)
y_true_test = test_data_backup["Placement Status"]

In [68]:
accuracy_test = accuracy_score(y_true_test, predictions)
precision_test = precision_score(y_true_test, predictions, average='weighted')
recall_test = recall_score(y_true_test, predictions, average='weighted', zero_division=0)

print("Test Accuracy:", accuracy_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)

Test Accuracy: 0.06204222317966394
Test Precision: 1.0
Test Recall: 0.06204222317966394


# Saving predictions to a separate excel file

In [69]:
output_df = pd.DataFrame({"First Name": test_data_backup["First Name"], "Email ID": test_data_backup["Email ID"], **test_data, "Placement Status": predictions})
output_df.to_excel("Prediction of Placement Status.xlsx", index=False)