# Importing Libraries

In [1]:
import pandas as pd
import copy
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Loading train data

In [2]:
train_data = pd.read_excel("Train Data.xlsx")

# Data Inspection

In [3]:
train_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4894 entries, 0 to 4893
Data columns (total 23 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   First Name                                                     4894 non-null   object 
 1   Email ID                                                       4894 non-null   object 
 2   Quantity                                                       4894 non-null   int64  
 3   Price Tier                                                     0 non-null      float64
 4   Ticket Type                                                    4894 non-null   object 
 5   Attendee #                                                     4490 non-null   float64
 6   Group                                                          0 non-null      float64
 7   Order Type                                                  

# Identifing and locating missing values

In [5]:
missing_data = train_data.isnull().sum()

In [6]:
missing_data

First Name                                                          0
Email ID                                                            0
Quantity                                                            0
Price Tier                                                       4894
Ticket Type                                                         0
Attendee #                                                        404
Group                                                            4894
Order Type                                                          0
Currency                                                          404
Total Paid                                                          0
Fees Paid                                                         404
Eventite Fees                                                       0
Eventite Payment Processing                                         0
Attendee Status                                                     0
College Name        

# Cleaning the data using emails

In [7]:
train_data = train_data.drop_duplicates(subset=["Email ID"])

In [8]:
train_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


# Creating deep copies of your dataframe

In [9]:
train_data_backup = copy.deepcopy(train_data)

In [10]:
train_data_backup.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,ANIKET,aniket@xyz.com,1,,Art of Resume Building,2213855000.0,,Free Order,USD,0,...,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Email,,Students,,6.7,2,5,Placed
1,Dhanshree,dhanshree@xyz.com,1,,Art of Resume Building,2213859000.0,,Free Order,USD,0,...,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Others,College,Students,,8.2,3,2,Not placed
2,Dhiraj,dhiraj@xyz.com,1,,Art of Resume Building,2213862000.0,,Free Order,USD,0,...,Attending,Don Bosco College of Engineering Fatorda Goa,Email,,Students,,6.5,4,3,Not placed
3,Pooja,pooja@xyz.com,1,,Art of Resume Building,2213988000.0,,Free Order,USD,0,...,Attending,Pillai College of Engineering New Panvel,Email,,Students,,8.7,2,5,Not placed
4,Aayush,aayush@xyz.com,1,,Art of Resume Building,2214567000.0,,Free Order,USD,0,...,Attending,St Xavier's College,Instagram | LinkedIn | Cloud Counselage Website,,Students,,9.1,3,5,Placed


# Selecting only useful columns

In [11]:
useful_columns = ["Ticket Type", "Attendee #", "Attendee Status", "College Name",
                  "Designation", "Year of Graduation", "CGPA", "Speaking Skills", 
                  "ML Knowledge", "Placement Status"]

train_data = train_data[useful_columns]

In [12]:
train_data.head()

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Art of Resume Building,2213855000.0,Attending,D Y PATIL INSTITUTE OF MCA AND MANAGEMENT AKUR...,Students,,6.7,2,5,Placed
1,Art of Resume Building,2213859000.0,Attending,AP SHAH INSTITUTE OF TECHNOLOGY,Students,,8.2,3,2,Not placed
2,Art of Resume Building,2213862000.0,Attending,Don Bosco College of Engineering Fatorda Goa,Students,,6.5,4,3,Not placed
3,Art of Resume Building,2213988000.0,Attending,Pillai College of Engineering New Panvel,Students,,8.7,2,5,Not placed
4,Art of Resume Building,2214567000.0,Attending,St Xavier's College,Students,,9.1,3,5,Placed


# Imputation for numerical columns

In [13]:
numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
train_data[numerical_columns] = numerical_imputer.fit_transform(train_data[numerical_columns])

# Data Scaling and Transformation

In [14]:
scaler = StandardScaler()
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])

# Converting categorical columns to numeric using Label Encoding

In [15]:
categorical_columns = ["Ticket Type", "Attendee Status",
                       "College Name", "Designation",
                       "Placement Status"]

for column in categorical_columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])

In [16]:
train_data[column]

0       1
1       0
2       0
3       0
4       1
       ..
4829    2
4831    2
4832    2
4834    2
4836    2
Name: Placement Status, Length: 1986, dtype: int32

In [17]:
train_data

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,0,-0.375282,0,5,31,0.0,-1.321069,-1.160410,1.326838,1
1,0,-0.375277,0,2,31,0.0,0.171673,-0.329933,-1.174840,0
2,0,-0.375273,0,7,31,0.0,-1.520102,0.500544,-0.340947,0
3,0,-0.375105,0,16,31,0.0,0.669254,-1.160410,1.326838,0
4,0,-0.374332,0,20,31,0.0,1.067318,-0.329933,1.326838,1
...,...,...,...,...,...,...,...,...,...,...
4829,15,-0.351863,0,44,31,0.0,1.067318,1.331021,0.492946,2
4831,15,-0.351821,0,52,31,0.0,-0.624456,-0.329933,-1.174840,2
4832,15,-0.351810,0,36,31,0.0,-0.723972,0.500544,0.492946,2
4834,15,-0.351755,0,53,31,0.0,1.166834,0.500544,-1.174840,2


# Separating features (x_train) and target (y_train)

In [18]:
x = train_data.drop("Placement Status", axis=1)
y = train_data["Placement Status"]

In [19]:
x

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,0,-0.375282,0,5,31,0.0,-1.321069,-1.160410,1.326838
1,0,-0.375277,0,2,31,0.0,0.171673,-0.329933,-1.174840
2,0,-0.375273,0,7,31,0.0,-1.520102,0.500544,-0.340947
3,0,-0.375105,0,16,31,0.0,0.669254,-1.160410,1.326838
4,0,-0.374332,0,20,31,0.0,1.067318,-0.329933,1.326838
...,...,...,...,...,...,...,...,...,...
4829,15,-0.351863,0,44,31,0.0,1.067318,1.331021,0.492946
4831,15,-0.351821,0,52,31,0.0,-0.624456,-0.329933,-1.174840
4832,15,-0.351810,0,36,31,0.0,-0.723972,0.500544,0.492946
4834,15,-0.351755,0,53,31,0.0,1.166834,0.500544,-1.174840


In [20]:
y

0       1
1       0
2       0
3       0
4       1
       ..
4829    2
4831    2
4832    2
4834    2
4836    2
Name: Placement Status, Length: 1986, dtype: int32

# Spliting the data into train and validation sets

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model that can handle missing values natively

In [22]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

# Loading test data

In [23]:
test_data = pd.read_excel("Test Data.xlsx")

# Data Inspection

In [24]:
test_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


In [25]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3796 entries, 0 to 3795
Data columns (total 23 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   First Name                                                     3796 non-null   object 
 1   Email ID                                                       3796 non-null   object 
 2   Quantity                                                       3796 non-null   int64  
 3   Price Tier                                                     0 non-null      float64
 4   Ticket Type                                                    3796 non-null   object 
 5   Attendee #                                                     3794 non-null   float64
 6   Group                                                          0 non-null      float64
 7   Order Type                                                  

# Identifing and locating missing values

In [26]:
missing_data = test_data.isnull().sum()

In [27]:
missing_data

First Name                                                          0
Email ID                                                            0
Quantity                                                            0
Price Tier                                                       3796
Ticket Type                                                         0
Attendee #                                                          2
Group                                                            3796
Order Type                                                          0
Currency                                                            2
Total Paid                                                          0
Fees Paid                                                           2
Eventite Fees                                                       0
Eventite Payment Processing                                         0
Attendee Status                                                     0
College Name        

# Cleaning the data using emails

In [28]:
test_data = test_data.drop_duplicates(subset=["Email ID"])

In [29]:
test_data.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


# Creating deep copies of your dataframe

In [30]:
test_data_backup = copy.deepcopy(test_data)

In [31]:
test_data_backup.head()

Unnamed: 0,First Name,Email ID,Quantity,Price Tier,Ticket Type,Attendee #,Group,Order Type,Currency,Total Paid,...,Attendee Status,College Name,How did you come to know about this event?,Specify in Others (how did you come to know about this event),Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge,Placement Status
0,Sahil,sahil@xyz.com,1,,Hello ML and DL,2293940000.0,,Free Order,USD,0,...,Attending,"symbiosis institute of technology, pune",Whatsapp,,Students,,7.8,3,3,
1,Amrita,amrita@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,"mit academy of engineering ,alandi",Whatsapp,,Students,,9.1,3,3,
2,Mamta,mamta@xyz.com,1,,Hello ML and DL,2293941000.0,,Free Order,USD,0,...,Attending,a. c. patil college of engineering,Whatsapp,,Students,,6.9,2,2,
3,Bhagyashri,bhagyashri@xyz.com,1,,Hello ML and DL,2293946000.0,,Free Order,USD,0,...,Attending,wilson college,Others,,Students,,8.4,4,4,
4,Divyanshu,divyanshu@xyz.com,1,,Hello ML and DL,2293956000.0,,Free Order,USD,0,...,Attending,"ld college of engineering, ahmedabad, gujarat",Whatsapp,,Students,,6.7,5,5,


# Selecting only useful columns

In [32]:
useful_columns = ["Ticket Type", "Attendee #", "Attendee Status", "College Name",
                  "Designation", "Year of Graduation", "CGPA", "Speaking Skills", "ML Knowledge"]

test_data = test_data[useful_columns]

In [33]:
test_data.head()

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,Hello ML and DL,2293940000.0,Attending,"symbiosis institute of technology, pune",Students,,7.8,3,3
1,Hello ML and DL,2293941000.0,Attending,"mit academy of engineering ,alandi",Students,,9.1,3,3
2,Hello ML and DL,2293941000.0,Attending,a. c. patil college of engineering,Students,,6.9,2,2
3,Hello ML and DL,2293946000.0,Attending,wilson college,Students,,8.4,4,4
4,Hello ML and DL,2293956000.0,Attending,"ld college of engineering, ahmedabad, gujarat",Students,,6.7,5,5


# Imputation for numerical columns

In [34]:
numerical_columns = test_data.select_dtypes(include=['int64', 'float64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
test_data[numerical_columns] = numerical_imputer.fit_transform(test_data[numerical_columns])

# Data Scaling and Transformation

In [35]:
scaler = StandardScaler()
test_data[numerical_columns] = scaler.fit_transform(test_data[numerical_columns])

# Converting categorical columns to numeric using Label Encoding

In [36]:
categorical_columns = ["Ticket Type", "Attendee Status",
                       "College Name", "Designation"
                       ]

for column in categorical_columns:
    le = LabelEncoder()
    test_data[column] = le.fit_transform(test_data[column])

In [37]:
test_data[column]

0       45
1       45
2       45
3       45
4       45
        ..
3685    45
3687    45
3692    45
3693    45
3695    45
Name: Designation, Length: 2321, dtype: int32

In [38]:
test_data

Unnamed: 0,Ticket Type,Attendee #,Attendee Status,College Name,Designation,Year of Graduation,CGPA,Speaking Skills,ML Knowledge
0,0,-0.269321,0,49,45,5.674538e-13,-0.233957,-0.254328,-0.240896
1,0,-0.269321,0,41,45,5.674538e-13,1.071365,-0.254328,-0.240896
2,0,-0.269320,0,27,45,5.674538e-13,-1.137641,-1.076466,-1.061922
3,0,-0.269316,0,53,45,5.674538e-13,0.368499,0.567810,0.580131
4,0,-0.269304,0,39,45,5.674538e-13,-1.338460,1.389948,1.401157
...,...,...,...,...,...,...,...,...,...
3685,11,-0.339259,0,34,45,5.674538e-13,-0.033138,-0.254328,-0.240896
3687,11,-0.338720,0,47,45,5.674538e-13,0.468909,1.389948,1.401157
3692,11,-0.338606,0,33,45,5.674538e-13,0.770137,-0.254328,-0.240896
3693,11,-0.338598,0,39,45,5.674538e-13,1.272184,0.567810,0.580131


# Model Evaluation

In [39]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8266331658291457


# Calculate precision and recall

In [40]:
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Validation Precision:", precision)
print("Validation Recall:", recall)

Validation Precision: 0.8025177677464536
Validation Recall: 0.8266331658291457


# Making predictions on test data

In [41]:
predictions = model.predict(test_data)

In [42]:
predictions

array([0, 0, 0, ..., 2, 2, 2])

# Saving predictions to a separate excel file

In [43]:
output_df = pd.DataFrame({"First Name": test_data_backup["First Name"], "Email ID": test_data_backup["Email ID"], **test_data, "Placement Status": predictions})
output_df.to_excel("Prediction of Placement Status.xlsx", index=False)