# <<<<<<<-----++++----LINKEDIN JOBS ANALYSIS-----++++------->>>>>>>

## Importing Libraries

In [176]:
# Importing below Python libraries for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Importing the warnings module for handling warning messages
import warnings

# Ignoring warning messages to prevent interruptions during code execution
warnings.filterwarnings("ignore")

## Reading our Dataset

In [177]:
# Reading the CSV file into a pandas DataFrame
df = pd.read_csv(r"D:\UNIT_PROJECT_ML\PYTHON\EXCEL\SCRAPED_DATA\scrapped data.csv")

In [178]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,Level_and_involvement,job_description,Total_applicants,Industry_and_Employee_count,LinkedIn_Followers
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,208 followers"
1,Project Manager,Wipro,"Delhi, Delhi, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,121 applicants,"10,001+ employees · IT Services and IT Consulting","6,737,209 followers"
2,Project Manager,Wipro,"Delhi, Delhi, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,210 followers"
3,Administrator,Wipro,"Hyderabad, Telangana, India",Full-time · Mid-Senior level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,211 followers"
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",Full-time · Entry level,This job is sourced from a job board. Learn mo...,22 applicants,"10,001+ employees · IT Services and IT Consulting","270,113 followers"


# ---------------------Data Cleaning and Noise Handling----------------------

## Spliting Level_and_involvement Column 

In [179]:
# Splitting the 'Level_and_involvement' column and assigning the split values to new columns 'Level' and 'Involvement'
df[['Level', 'Involvement']] = df['Level_and_involvement'].str.split('·', expand=True)

In [180]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,Level_and_involvement,job_description,Total_applicants,Industry_and_Employee_count,LinkedIn_Followers,Level,Involvement
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,208 followers",Full-time,Entry level
1,Project Manager,Wipro,"Delhi, Delhi, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,121 applicants,"10,001+ employees · IT Services and IT Consulting","6,737,209 followers",Full-time,Entry level
2,Project Manager,Wipro,"Delhi, Delhi, India",Full-time · Entry level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,210 followers",Full-time,Entry level
3,Administrator,Wipro,"Hyderabad, Telangana, India",Full-time · Mid-Senior level,Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,211 followers",Full-time,Mid-Senior level
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",Full-time · Entry level,This job is sourced from a job board. Learn mo...,22 applicants,"10,001+ employees · IT Services and IT Consulting","270,113 followers",Full-time,Entry level


In [181]:
# Dropping the 'Level_and_involvement' column from the DataFrame
df.drop('Level_and_involvement', axis=1, inplace=True)

In [182]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,Industry_and_Employee_count,LinkedIn_Followers,Level,Involvement
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,208 followers",Full-time,Entry level
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121 applicants,"10,001+ employees · IT Services and IT Consulting","6,737,209 followers",Full-time,Entry level
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,210 followers",Full-time,Entry level
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,,"10,001+ employees · IT Services and IT Consulting","6,737,211 followers",Full-time,Mid-Senior level
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22 applicants,"10,001+ employees · IT Services and IT Consulting","270,113 followers",Full-time,Entry level


## Removing Noise from Total_applicants Column

In [183]:
# Modifying the 'Total_applicants' column

# Remove ' applicants' from the column values
df['Total_applicants'] = df['Total_applicants'].str.replace(' applicants', '')

# Remove ' applicant' from the column values
df['Total_applicants'] = df['Total_applicants'].str.replace(' applicant', '')

# Replace missing values with 0
df['Total_applicants'] = df['Total_applicants'].fillna(0)

# Convert the column to integer data type
df['Total_applicants'] = df['Total_applicants'].astype(int)


# In above code we can make use of regular expression pattern ' applicants?' with a question mark. 
#This pattern matches both 'applicants' and 'applicant', allowing for variations in the string
df['Total_applicants'] = df['Total_applicants'].str.replace(' applicants?', '')

In [184]:
# d["amount"]=d["amount"].str.replace("[^0-9\.]", "", regex=True)

In [185]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,Industry_and_Employee_count,LinkedIn_Followers,Level,Involvement
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,0,"10,001+ employees · IT Services and IT Consulting","6,737,208 followers",Full-time,Entry level
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121,"10,001+ employees · IT Services and IT Consulting","6,737,209 followers",Full-time,Entry level
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,0,"10,001+ employees · IT Services and IT Consulting","6,737,210 followers",Full-time,Entry level
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,0,"10,001+ employees · IT Services and IT Consulting","6,737,211 followers",Full-time,Mid-Senior level
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22,"10,001+ employees · IT Services and IT Consulting","270,113 followers",Full-time,Entry level


## Spliting of Industry_and_Employee_count Column

In [186]:
# Splitting the 'Industry_and_Employee_count' column using '·' as the separator
# and expanding it into two separate columns 'Employee_count' and 'Industry'
df[['Employee_count', 'Industry']] = df['Industry_and_Employee_count'].str.split('·', expand=True)


In [187]:
# Dropping the 'Industry_and_Employee_count' column from the DataFrame
df.drop('Industry_and_Employee_count', axis=1, inplace=True)

In [188]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,208 followers",Full-time,Entry level,"10,001+ employees",IT Services and IT Consulting
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121,"6,737,209 followers",Full-time,Entry level,"10,001+ employees",IT Services and IT Consulting
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,210 followers",Full-time,Entry level,"10,001+ employees",IT Services and IT Consulting
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,211 followers",Full-time,Mid-Senior level,"10,001+ employees",IT Services and IT Consulting
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22,"270,113 followers",Full-time,Entry level,"10,001+ employees",IT Services and IT Consulting


## Removing Noise from Employee_count Column

In [189]:
# Remove commas from the 'Employee_count' column
df['Employee_count'] = df['Employee_count'].str.replace(',', '')

# Remove ' employees' suffix from the 'Employee_count' column
df['Employee_count'] = df['Employee_count'].str.replace(' employees', '')

# Remove '+' character from the 'Employee_count' column
df['Employee_count'] = df['Employee_count'].str.replace('+', '')

# Extract the last 6 characters from the 'Employee_count' column
df['Employee_count'] = df['Employee_count'].str[-6:]

# Remove '-' character from the 'Employee_count' column
df['Employee_count'] = df['Employee_count'].str.replace('-', '')

# Convert the cleaned values to integers
df['Employee_count'] = df['Employee_count'].astype(int)

In [190]:
# df['Employee_count'].str.replace(',', '').str.replace(' employees', '').str.replace('+', '').str[-6:]

In [191]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,208 followers",Full-time,Entry level,10001,IT Services and IT Consulting
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121,"6,737,209 followers",Full-time,Entry level,10001,IT Services and IT Consulting
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,210 followers",Full-time,Entry level,10001,IT Services and IT Consulting
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,0,"6,737,211 followers",Full-time,Mid-Senior level,10001,IT Services and IT Consulting
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22,"270,113 followers",Full-time,Entry level,10001,IT Services and IT Consulting


## Removing Noise from LinkedIn_Followers Column

In [192]:
# Remove ' followers' suffix from the 'LinkedIn_Followers' column
df['LinkedIn_Followers'] = df['LinkedIn_Followers'].str.replace(' followers', '')

# Remove commas from the 'LinkedIn_Followers' column
df['LinkedIn_Followers'] = df['LinkedIn_Followers'].str.replace(',', '')

# df['LinkedIn_Followers'] = df['LinkedIn_Followers'].str.replace(' followers', '').str.replace(',', '')

In [193]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,0,6737208,Full-time,Entry level,10001,IT Services and IT Consulting
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121,6737209,Full-time,Entry level,10001,IT Services and IT Consulting
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,0,6737210,Full-time,Entry level,10001,IT Services and IT Consulting
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,0,6737211,Full-time,Mid-Senior level,10001,IT Services and IT Consulting
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22,270113,Full-time,Entry level,10001,IT Services and IT Consulting


## Taking Median of Linkedin_followers for different Companies

In [194]:
# Calculate the median of 'LinkedIn_Followers' column grouped by 'Name'
data = df[['Name', 'LinkedIn_Followers']].groupby('Name').median().reset_index()

# Convert the 'Name' and 'LinkedIn_Followers' columns to a dictionary
dict_data = dict(zip(data['Name'], data['LinkedIn_Followers']))

# Update the 'LinkedIn_Followers' column in the original DataFrame with median values
for i in dict_data:
    # Find rows where 'Name' matches the current iteration value and update 'LinkedIn_Followers' 
    # with the corresponding median value
    df.loc[df['Name'] == i, 'LinkedIn_Followers'] = int(dict_data[i])


In [195]:
# d = dict(zip(data['Name'], data['Class']))
# d

In [196]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry
0,Project Manager,Wipro,"Mumbai, Maharashtra, India",Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
1,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
2,Project Manager,Wipro,"Delhi, Delhi, India",Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
3,Administrator,Wipro,"Hyderabad, Telangana, India",Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting
4,Java Developer,LTIMindtree,"Hyderabad, Telangana, India",This job is sourced from a job board. Learn mo...,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting


## Taking appropriate data from Location column

In [197]:
# Split the 'Location' column by comma and extract the second element
df['Location'] = df['Location'].str.split(",", expand=True)[1]

In [198]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry
0,Project Manager,Wipro,Maharashtra,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
1,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
2,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting
3,Administrator,Wipro,Telangana,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting
4,Java Developer,LTIMindtree,Telangana,This job is sourced from a job board. Learn mo...,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting


# ----------------------Null Handling------------------

## Checking missing values

In [199]:
# Calculate the number of null values in each column
null_counts = df.isnull().sum()

# Print the results
print(null_counts)

Designation            0
Name                   0
Location              83
job_description        0
Total_applicants       0
LinkedIn_Followers     0
Level                  0
Involvement            0
Employee_count         0
Industry               0
dtype: int64

## Dropping rows with missing values

In [200]:
# Remove rows with null values
df.dropna(inplace=True)

# Reset the index of the DataFrame
df.reset_index(inplace=True, drop=True)

#    --------------------------PHASE 2----------------------------

## Here we will try to collect certain skills mention below from our Job_description column

## Let's collect some most common technical skills

In [202]:
# The technical_skills list contains a collection of technical skills.
technical_skills = ['PYTHON', 'C++', 'JAVA', 'HADOOP', 'SCALA', 
                    'FLASK', 'PANDAS', 'SPARK', 'NUMPY', 
                    'PHP', 'SQL', 'MYSQL', 'CSS', 'MONGODB', 'NLTK', 'KERAS', 
                    'PYTORCH', 'TENSORFLOW', 'LINUX', 'RUBY', 'JAVASCRIPT', 
                    'DJANGO', 'REACT', 'REACTJS', 'AI', 'UI', 'TABLEAU', 
                    'NODEJS', 'EXCEL', 'POWER BI', 'SELENIUM', 'HTML', 'ML']

## Adding above technical_skills into our DataFrame ( filling them with 0 )

In [203]:
# Setting the values of technical skills columns to 0
df[technical_skills] = 0

In [204]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry,...,REACTJS,AI,UI,TABLEAU,NODEJS,EXCEL,POWER BI,SELENIUM,HTML,ML
0,Project Manager,Wipro,Maharashtra,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,0,0
1,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,0,0
2,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,0,0
3,Administrator,Wipro,Telangana,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,0,0
4,Java Developer,LTIMindtree,Telangana,This job is sourced from a job board. Learn mo...,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,0,0


# --------------Extracting skills(keywords) from job_description coloumn

## We can perform same task with nlptk or keybert libraries

In [205]:
# Exammple of list comprehension
skills = df['job_description'].str.upper().apply(lambda x: [i.upper() for i in technical_skills if i in x])

In [206]:
# x = df.iloc[0,3].upper()

# def aastha(x):
#     for i in technical_skills:
#         if i in x:
#             print(i.upper())

# df['job'].apply(aastha)

In [207]:
skills

0                             [SCALA, AI, UI, EXCEL, ML]
1                             [SCALA, AI, UI, EXCEL, ML]
2                             [SCALA, AI, UI, EXCEL, ML]
3                                 [SCALA, AI, UI, EXCEL]
4                                  [JAVA, CSS, HTML, ML]
                             ...                        
837                                      [AI, UI, EXCEL]
838                                       [JAVA, AI, UI]
839                  [PYTHON, SCALA, FLASK, SQL, AI, UI]
840                                       [JAVA, AI, UI]
841    [PYTHON, C++, JAVA, HADOOP, SCALA, SPARK, SQL,...
Name: job_description, Length: 842, dtype: object

## Filling our Skills_columns and flaging them with appropriate value

In [208]:
# Setting the values of technical skills columns to 1 based on skill sets
for index, skill_set in enumerate(skills):
    for x in skill_set:
        df[x][index] = 1

In [209]:
df['EXCEL'][4]

0

In [210]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,job_description,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry,...,REACTJS,AI,UI,TABLEAU,NODEJS,EXCEL,POWER BI,SELENIUM,HTML,ML
0,Project Manager,Wipro,Maharashtra,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,1,1,0,0,1,0,0,0,1
1,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,1,1,0,0,1,0,0,0,1
2,Project Manager,Wipro,Delhi,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,1,1,0,0,1,0,0,0,1
3,Administrator,Wipro,Telangana,Role Purpose \nThe purpose of the role is to ...,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting,...,0,1,1,0,0,1,0,0,0,0
4,Java Developer,LTIMindtree,Telangana,This job is sourced from a job board. Learn mo...,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting,...,0,0,0,0,0,0,0,0,1,1


## Dropping such skill_column where there is no data!!

In [211]:
# Removing technical skills columns with the same unique value in all rows
for i in technical_skills:
    if len(df[i].unique()) == 1:
        df.drop(i, axis=1, inplace=True)

## Droping the Job Description column

In [212]:
df.drop('job_description', axis=1, inplace=True)

## Checking for Duplicates

In [213]:
df.duplicated().sum()

31

## Dropping Duplicates

In [214]:
df.drop_duplicates(inplace = True)

In [215]:
df.reset_index(inplace=True, drop=True)

In [216]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry,PYTHON,...,REACTJS,AI,UI,TABLEAU,NODEJS,EXCEL,POWER BI,SELENIUM,HTML,ML
0,Project Manager,Wipro,Maharashtra,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
1,Project Manager,Wipro,Delhi,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
2,Project Manager,Wipro,Delhi,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
3,Administrator,Wipro,Telangana,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,0
4,Java Developer,LTIMindtree,Telangana,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,0,0,0,0,0,0,0,1,1


# -------------------Unsupervised Learning------------------

# With the help of clustering_algorithm we r gonna form different classes based on linkedin_followers and employees_count

In [217]:
# Creating a new DataFrame 'data' with selected columns from 'df'
data = df[['Name', 'Employee_count', 'LinkedIn_Followers']].copy()

In [218]:
# Removing duplicate rows in 'data' DataFrame
data.drop_duplicates(inplace=True)

# Resetting the index of 'data' DataFrame
data.reset_index(inplace=True, drop=True)

In [220]:
# Creating the feature matrix X by dropping the 'Name' column from the 'data' DataFrame
X = data.drop(['Name'], axis=1)

## Feature Scaling

In [221]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [222]:
# Importing the necessary module and class
from sklearn.preprocessing import StandardScaler

# Creating an instance of the StandardScaler class
scaler = StandardScaler()

# Scaling the features in X
scaled_X = scaler.fit_transform(X)

## k means clustering model

In [225]:
# Importing the necessary module and class
from sklearn.cluster import KMeans

# Creating an instance of the KMeans class
model = KMeans(n_clusters=4)

## Predicting the class of each company

In [226]:
# Performing k-means clustering and obtaining cluster assignments
pred = model.fit_predict(scaled_X)

In [227]:
pred

array([2, 1, 3, 0, 3, 1, 1, 2, 0, 2, 1])

In [228]:
data['Class'] = pred

## Assigning the class to each company

In [229]:
# Mapping cluster labels to class names
data.loc[data['Class'] == 0, 'Class'] = 'Class 1'
data.loc[data['Class'] == 1, 'Class'] = 'Class 2'
data.loc[data['Class'] == 2, 'Class'] = 'Class 3'
data.loc[data['Class'] == 3, 'Class'] = 'Class 4'

In [230]:
# Drop the 'Employee_count' and 'LinkedIn_Followers' columns from the 'data' DataFrame
data.drop(['Employee_count', 'LinkedIn_Followers'], axis=1, inplace=True)

In [231]:
data

Unnamed: 0,Name,Class
0,Wipro,Class 3
1,LTIMindtree,Class 2
2,IDESLABS PRIVATE LIMITED,Class 4
3,Cactus Communications,Class 1
4,ACURA SOLUTIONS LTD,Class 4
5,UST,Class 2
6,Vodafone Idea Limited,Class 2
7,Tata Consultancy Services,Class 3
8,Uplers,Class 1
9,Infosys,Class 3


In [232]:
# Displaying the first few rows of the DataFrame
df.head()

Unnamed: 0,Designation,Name,Location,Total_applicants,LinkedIn_Followers,Level,Involvement,Employee_count,Industry,PYTHON,...,REACTJS,AI,UI,TABLEAU,NODEJS,EXCEL,POWER BI,SELENIUM,HTML,ML
0,Project Manager,Wipro,Maharashtra,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
1,Project Manager,Wipro,Delhi,121,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
2,Project Manager,Wipro,Delhi,0,6737300,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,1
3,Administrator,Wipro,Telangana,0,6737300,Full-time,Mid-Senior level,10001,IT Services and IT Consulting,0,...,0,1,1,0,0,1,0,0,0,0
4,Java Developer,LTIMindtree,Telangana,22,270280,Full-time,Entry level,10001,IT Services and IT Consulting,0,...,0,0,0,0,0,0,0,0,1,1


## Merging the class of the company with the original DataFrame

In [233]:
# Merge the 'data' DataFrame with the 'df' DataFrame using an inner join on the 'Name' column
df = pd.merge(data, df, how='inner', on='Name')

# select df.*, data.class from df inner join data on df.Name=data.Name >> sql query

## Saving the Final Clean Dataset into a csv_file.

In [234]:
# Save the DataFrame 'df' to a CSV file named 'ML_final_data.csv'
df.to_csv('ML_final_data.csv', index=False)

# --------------Deeksha, Aastha, Ravi, Rahul---------------------------

# Phir Milenge Chalte-Chalte !!!!

In [235]:
# Save the DataFrame 'df' to a pickle file named 'ML_final_data(darr).pkl'
df.to_pickle('ML_final_data(darr).pkl')