In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load and Understand the Data

# Load the dataset
df = pd.read_csv('Admission_Predict_Ver1.1.csv')

# Check the first few rows of the dataset
print(df.head())

# Check the data types and missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
df.info()

# Some raw headers carry trailing whitespace (e.g. 'LOR ' and 'chance of admit ').
# Strip the headers first so the mapping below does not depend on invisible
# characters, then rename to snake-case style identifiers. errors='raise' makes a
# header mismatch fail here instead of silently leaving the old names in place.
df.columns = df.columns.str.strip()
df = df.rename(
    columns={
        'Serial No.': 'Serial_No',
        'GRE Score': 'GRE_Score',
        'TOEFL Score': 'TOEFL_Score',
        'University Rating': 'University_rating',
        'chance of admit': 'Chance_of_Admit',
    },
    errors='raise',
)

# Analyze the distribution of the target variable 'Chance_of_Admit':
# relative frequency of each distinct score value.
target_variable = 'Chance_of_Admit'
target_distribution = df[target_variable].value_counts(normalize=True)
print("Distribution of the Target Variable (Chance_of_Admit):")
print(target_distribution)






   Serial No.  GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  \
0           1        337          118                  4  4.5   4.5  9.65   
1           2        324          107                  4  4.0   4.5  8.87   
2           3        316          104                  3  3.0   3.5  8.00   
3           4        322          110                  3  3.5   2.5  8.67   
4           5        314          103                  2  2.0   3.0  8.21   

   Research  chance of admit   
0         1              0.92  
1         1              0.76  
2         1              0.72  
3         1              0.80  
4         0              0.65  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 

In [25]:
# 'Serial_No' looks like a running row number rather than an applicant attribute,
# so it is removed before modelling — TODO confirm against the dataset description.
# Re-binding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['Serial_No'], errors='ignore')
df

Unnamed: 0,GRE_Score,TOEFL_Score,University_rating,SOP,LOR,CGPA,Research,Chance_of_Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


In [None]:
# Count how many rows fall on each side of the 0.5 cut-off.
# This needs the continuous score column (target_variable) to be present in df.
# The first count uses >=, so its label says "greater than or equal to".
num_greater_than_0_5 = int((df[target_variable] >= 0.5).sum())
num_less_than_0_5 = int((df[target_variable] < 0.5).sum())

print(f"Number of values greater than or equal to 0.5: {num_greater_than_0_5}")
print(f"Number of values less than 0.5: {num_less_than_0_5}")

In [26]:
# Classification techniques are required for this dataset, so the continuous
# 'Chance_of_Admit' score is converted into a discrete label using a threshold.

# Scores at or above this value are labelled 1; all others are labelled 0.
threshold = 0.5

# Derive the binary target directly from the score column. `df` itself is left
# untouched so this cell can be re-run (for example with a different threshold)
# without raising KeyError on an already-dropped column.
y = pd.Series(
    np.where(df['Chance_of_Admit'] >= threshold, 1, 0),
    index=df.index,
    name='Admission_Status',
)

# Features (X): every column except the original continuous score.
X = df.drop(columns=['Chance_of_Admit'])


In [27]:

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Feature Selection/Engineering (optional)
# Nothing is done here; domain knowledge or feature-importance analysis could
# be used at this point to select or engineer features.

# Step 4: Train a Classification Model (Random Forest Classifier)
# fit() returns the estimator itself, so construction and training are chained.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Keep the fitted estimator as the final expression so the cell displays its repr.
rf_classifier

RandomForestClassifier(random_state=42)

In [28]:
# Step 5: Model Evaluation

# Predict a class label for every row of the held-out test features. The result
# (y_pred) is what the metrics cell later compares against y_test.
y_pred = rf_classifier.predict(X_test)



In [31]:


# One hypothetical applicant, expressed as a single record whose keys are the
# feature column names the classifier was trained on.
input_data = pd.DataFrame([
    {
        'GRE_Score': 320,
        'TOEFL_Score': 110,
        'University_rating': 5,
        'SOP': 4.5,
        'LOR': 4.0,
        'CGPA': 9.0,
        'Research': 1,
    }
])

# Run the trained classifier on that one-row frame.
predicted_admission_status = rf_classifier.predict(input_data)

# predict() returns one label per input row; show the only one.
print("Predicted Admission Status:", predicted_admission_status[0])


Predicted Admission Status: 1


In [32]:

# Compute the summary metrics first, then report them together.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Overall fraction of test rows that were labelled correctly.
print("Model Accuracy:", accuracy)

# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Counts of true labels versus predicted labels, printed under its heading.
print("Confusion Matrix:", conf_matrix, sep="\n")

# Remaining optional steps:
# Step 6: Make Predictions - apply the trained model to new, unseen data.
# Step 7: Communicate Results - present performance and insights clearly and concisely.
# Step 8: Iterate and Improve - try different approaches if performance needs to improve.

Model Accuracy: 0.94
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.44      0.57         9
           1       0.95      0.99      0.97        91

    accuracy                           0.94       100
   macro avg       0.87      0.72      0.77       100
weighted avg       0.93      0.94      0.93       100

Confusion Matrix:
[[ 4  5]
 [ 1 90]]


In [None]:
Step 1: Understanding the Dataset

Download the dataset from the provided Kaggle link and load it into your preferred data analysis environment (Python, R, etc.).
Explore the dataset to understand its structure, features, and target variable (in this case, "Chance of Admit").
Check for missing values, data types, and any data preprocessing steps that may be required.
Step 2: Data Preprocessing

Handle missing data: Impute or remove missing values based on the nature of the missing data.
Feature engineering: Analyze and transform existing features or create new features that might improve the model's performance.
Feature scaling: If necessary, scale numerical features to bring them to a similar range, which helps some models converge.
Step 3: Splitting the Data

Split the dataset into a training set and a testing set. The training set will be used to train the classification model, and the testing set will be used for evaluation.
Step 4: Selecting a Classification Model

Research and choose a suitable classification algorithm for the problem. Commonly used algorithms include Logistic Regression, Decision Trees, Random Forest, Support Vector Machines, etc.
Import the necessary libraries for the selected algorithm.
Step 5: Model Training

Train the selected classification model on the training set using the fit() function or the appropriate method for the chosen algorithm.
Step 6: Model Evaluation

Predict the admission outcome (class labels, or class probabilities where the model provides them) on the testing set using the trained model.
Evaluate the model's performance using relevant metrics such as accuracy, precision, recall, F1-score, ROC-AUC, etc.
Step 7: Model Tuning (Optional)

If the model's performance is not satisfactory, consider hyperparameter tuning or trying different algorithms to improve the results.
Use techniques like cross-validation or grid search to find the best hyperparameters for the model.
Step 8: Interpretation and Visualization

Interpret the results and understand which features are most influential in predicting graduate admissions.
Visualize important features, model performance metrics, and any other relevant insights.
Step 9: Conclusion and Reporting

Summarize the findings, including the chosen model's performance and any insights gained from the analysis.
Create a report or presentation to present your work and results to stakeholders or supervisors.
Step 10: Deployment (Optional)

If required, deploy the trained model to a production environment for real-time predictions.
Remember, this sequence may vary depending on the specific challenges you encounter during the analysis. As a data science intern, it's essential to be open to learning and adapt to different scenarios as you work on the project. Good luck with your graduate admissions prediction project!