Logistic regression algorithm


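Accuracy = (TP + TN) / (TP + TN + FP + FN), where TP and TN are the numbers of true positives and true negatives, and FP and FN are the numbers of false positives and false negatives.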
​

In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import warnings

# Blanket-suppress warnings to keep the cell output short.
# NOTE(review): this also hides any solver convergence warning from the model
# fitted below, which is why max_iter is raised explicitly there.
warnings.filterwarnings('ignore')

# Load the dataset
dataset = pd.read_csv('/content/DataAnalyst.csv')

# Data Cleaning and Preprocessing
# (Assuming that the dataset includes columns like 'Job Title', 'Job Description', 'Location', 'Rating', etc.)
# A posting with an empty description is read as NaN (a float); calling
# .lower() on it would raise AttributeError, so treat it as empty text.
descriptions = dataset["Job Description"].fillna("").astype(str)

# Step 1: Identify various Big Data job families in the given dataset
job_descriptions = descriptions.tolist()
big_data_skills = ["big data", "hadoop", "spark", "impala", "cassandra", "kafka", "hdfs", "hbase", "hive", "mongo db", 'flume', 'sqoop', 'flink']

# One boolean column per skill, one row per posting. The scan is done once and
# reused for both the per-skill counts and the classification target.
# NOTE(review): this is a plain case-insensitive substring test, so "hive" also
# matches inside "archive" and "mongo db" does not match "mongodb". The labels
# are kept as-is here; tighten the matching only if all cells change together.
descriptions_lower = descriptions.str.lower()
skill_hits = pd.DataFrame(
    {skill: descriptions_lower.str.contains(skill, regex=False) for skill in big_data_skills}
)

# Number of postings mentioning each skill; skills that never occur are left
# out, so the mapping only holds skills seen at least once.
big_data_required = defaultdict(int)
for skill, n_postings in skill_hits.sum().items():
    if n_postings:
        big_data_required[skill] += int(n_postings)

# Create a DataFrame for Big Data job families, most requested skill first
big_data_df = pd.DataFrame(
    list(big_data_required.items()),
    columns=['Big Data Technologies', 'Skills Requirement'],
).sort_values('Skills Requirement', ascending=False)

# Target: True when the posting mentions at least one Big Data skill
dataset['BigDataSkill'] = skill_hits.any(axis=1)

# Feature Engineering
# NOTE(review): the target is derived from the same text that is used as the
# feature, so the model is learning to recover the keyword rule above.
X = descriptions
y = dataset['BigDataSkill']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data (vocabulary is learned from the training split only)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build a logistic regression model
# Raw word counts give a very wide, unscaled feature matrix; allow the solver
# more iterations than the library default so it can converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9246119733924612

Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.99      0.96       382
        True       0.91      0.57      0.70        69

    accuracy                           0.92       451
   macro avg       0.92      0.78      0.83       451
weighted avg       0.92      0.92      0.92       451



Random forest algorithm

In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings

# Blanket-suppress warnings to keep the cell output short.
warnings.filterwarnings('ignore')

# Load the dataset
dataset = pd.read_csv('/content/DataAnalyst.csv')

# Data Cleaning and Preprocessing
# (Assuming that the dataset includes columns like 'Job Title', 'Job Description', 'Location', 'Rating', etc.)
# A posting with an empty description is read as NaN (a float); calling
# .lower() on it would raise AttributeError, so treat it as empty text.
descriptions = dataset["Job Description"].fillna("").astype(str)

# Step 1: Identify various Big Data job families in the given dataset
job_descriptions = descriptions.tolist()
big_data_skills = ["big data", "hadoop", "spark", "impala", "cassandra", "kafka", "hdfs", "hbase", "hive", "mongo db", 'flume', 'sqoop', 'flink']

# One boolean column per skill, one row per posting. The scan is done once and
# reused for both the per-skill counts and the classification target.
# NOTE(review): this is a plain case-insensitive substring test, so "hive" also
# matches inside "archive" and "mongo db" does not match "mongodb".
descriptions_lower = descriptions.str.lower()
skill_hits = pd.DataFrame(
    {skill: descriptions_lower.str.contains(skill, regex=False) for skill in big_data_skills}
)

# Number of postings mentioning each skill; skills that never occur are left
# out, so the mapping only holds skills seen at least once.
big_data_required = defaultdict(int)
for skill, n_postings in skill_hits.sum().items():
    if n_postings:
        big_data_required[skill] += int(n_postings)

# Create a DataFrame for Big Data job families, most requested skill first
big_data_df = pd.DataFrame(
    list(big_data_required.items()),
    columns=['Big Data Technologies', 'Skills Requirement'],
).sort_values('Skills Requirement', ascending=False)

# Target: True when the posting mentions at least one Big Data skill
dataset['BigDataSkill'] = skill_hits.any(axis=1)

# Feature Engineering
X = descriptions
y = dataset['BigDataSkill']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data (vocabulary is learned from the training split only)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build a Random Forest model
# The forest is built with random sampling, so fix the seed (same value as the
# split above) to make the reported metrics repeatable across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.917960088691796

Classification Report:
               precision    recall  f1-score   support

       False       0.91      1.00      0.95       382
        True       1.00      0.46      0.63        69

    accuracy                           0.92       451
   macro avg       0.96      0.73      0.79       451
weighted avg       0.93      0.92      0.90       451



Decision tree

In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings

# Blanket-suppress warnings to keep the cell output short.
warnings.filterwarnings('ignore')

# Load the dataset
dataset = pd.read_csv('/content/DataAnalyst.csv')

# Data Cleaning and Preprocessing
# (Assuming that the dataset includes columns like 'Job Title', 'Job Description', 'Location', 'Rating', etc.)
# A posting with an empty description is read as NaN (a float); calling
# .lower() on it would raise AttributeError, so treat it as empty text.
descriptions = dataset["Job Description"].fillna("").astype(str)

# Step 1: Identify various Big Data job families in the given dataset
job_descriptions = descriptions.tolist()
big_data_skills = ["big data", "hadoop", "spark", "impala", "cassandra", "kafka", "hdfs", "hbase", "hive", "mongo db", 'flume', 'sqoop', 'flink']

# One boolean column per skill, one row per posting. The scan is done once and
# reused for both the per-skill counts and the classification target.
# NOTE(review): this is a plain case-insensitive substring test, so "hive" also
# matches inside "archive" and "mongo db" does not match "mongodb".
descriptions_lower = descriptions.str.lower()
skill_hits = pd.DataFrame(
    {skill: descriptions_lower.str.contains(skill, regex=False) for skill in big_data_skills}
)

# Number of postings mentioning each skill; skills that never occur are left
# out, so the mapping only holds skills seen at least once.
big_data_required = defaultdict(int)
for skill, n_postings in skill_hits.sum().items():
    if n_postings:
        big_data_required[skill] += int(n_postings)

# Create a DataFrame for Big Data job families, most requested skill first
big_data_df = pd.DataFrame(
    list(big_data_required.items()),
    columns=['Big Data Technologies', 'Skills Requirement'],
).sort_values('Skills Requirement', ascending=False)

# Target: True when the posting mentions at least one Big Data skill
dataset['BigDataSkill'] = skill_hits.any(axis=1)

# Feature Engineering
X = descriptions
y = dataset['BigDataSkill']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data (vocabulary is learned from the training split only)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build a Decision Tree model
# Tree construction involves randomness when choosing between candidate
# splits, so fix the seed (same value as the split above) to make the reported
# metrics repeatable across runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred_dt = dt_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

Accuracy: 0.9711751662971175

Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.99      0.98       382
        True       0.94      0.87      0.90        69

    accuracy                           0.97       451
   macro avg       0.96      0.93      0.94       451
weighted avg       0.97      0.97      0.97       451

