In [None]:
import pandas as pd
import numpy as np
import bcrypt
from pymongo import MongoClient
import ssl
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer


  ## Importing the dataset 

In [None]:
data=pd.read_csv('merged_df.csv')

In [None]:
# Drop columns that are not needed for the analysis (including the 'password'
# field). Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the columns have already been removed.
data = data.drop(columns=['Unnamed: 0', '_id', 'password', 'Timestamp'], errors='ignore')

In [None]:
data

## Data pre-processing

In [None]:
# Disable SSL verification
# NOTE(review): this replaces the process-wide default HTTPS context factory with
# the unverified one, so every later HTTPS request that uses the default context
# in this kernel skips certificate checks, not only the NLTK download below.
# Presumably a workaround for local certificate errors; confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')

In [None]:
#Listing out stopwords in english
# NOTE(review): `x` is not referenced again in the visible notebook;
# general_preprocess() fetches the stop-word list itself.
x=stopwords.words('english')

In [None]:
# NOTE(review): general_preprocess() creates its own WordNetLemmatizer, so this
# module-level instance appears to be unused in the visible notebook.
lemma = WordNetLemmatizer()

In [None]:
# Download required NLTK data
# 'wordnet' is the corpus behind WordNetLemmatizer; 'omw-1.4' is presumably
# needed by the installed NLTK version's WordNet reader (confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')

def general_preprocess(to_preprocess_column_name, df_name):
    """
    Cleans one text column of a DataFrame for downstream vectorisation.

    Each value has its non-alphabetic characters replaced by spaces, is
    lower-cased, split on whitespace, filtered against the NLTK English
    stop-word list and lemmatised with WordNetLemmatizer.

    Parameters:
    - to_preprocess_column_name (str): Name of the column to clean.
    - df_name (pandas.DataFrame): The DataFrame that holds the column.

    Returns:
    - list of str: One cleaned string per row, in row order. Missing values
      (NaN/None) yield an empty string.
    """
    lemma = WordNetLemmatizer()
    # Build the stop-word set once. The previous version rebuilt it for every
    # single word, which dominated the run time on larger frames.
    stop_words = set(stopwords.words('english'))

    to_preprocess_column_name_processed = []

    # Iterate over the column directly instead of materialising a full row with
    # df.iloc[i] for every record.
    for raw_value in df_name[to_preprocess_column_name]:
        # re.sub raises TypeError on NaN/None or numeric cells, so normalise the
        # value to text first: missing -> '', anything else -> str(value).
        if isinstance(raw_value, str):
            text = raw_value
        elif pd.api.types.is_scalar(raw_value) and pd.isna(raw_value):
            text = ''
        else:
            text = str(raw_value)

        text = re.sub('[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
        words = text.lower().split()  # Lowercase, then split into words
        # Lemmatize and remove stopwords
        kept = [lemma.lemmatize(word) for word in words if word not in stop_words]
        to_preprocess_column_name_processed.append(' '.join(kept))

    return to_preprocess_column_name_processed

In [None]:
Topic_preprocessed=general_preprocess('Topic',data)

In [None]:
Topic_preprocessed

In [None]:
Skills_preprocessed=general_preprocess('Skills',data)

In [None]:
Skills_preprocessed

## Combining target columns 

In [None]:
# Collect the cleaned Skills/Topic text side by side.
# NOTE(review): df_preprocessed is not referenced again in the visible notebook;
# the 'Combined' column is later built from the raw columns of `data`. Confirm
# which version is meant to feed TF-IDF.
df_preprocessed = pd.DataFrame({
    'Skills': Skills_preprocessed,
    'Topic': Topic_preprocessed
})

In [None]:
data

In [None]:
# Concatenate the raw Skills and Topic text of `data` into a single field.
# This adds the 'Combined' column to `data` itself, not only to df_combined.
data['Combined'] = data['Skills'] + ', ' + data['Topic']

# Drop the individual columns if you only need the combined column
df_combined = data.drop(columns=['Skills', 'Topic'])

In [None]:
df_combined

## TF-IDF

In [None]:
train_data=df_combined['Combined']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit TF-IDF on the combined Skills/Topic text; each row of train_data becomes
# one row of tfidf_matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(train_data)

# Pairwise cosine similarity between all rows; used as the precomputed affinity
# for clustering further down.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)


In [None]:
cosine_sim_matrix.shape

## Affinity Propagation clustering

Affinity Propagation is a clustering algorithm that does not require the number of clusters to be specified in advance. It works by passing messages between data points, using a pairwise similarity matrix as input, to identify exemplars and the clusters of similar items that form around them.

In [None]:
from sklearn.cluster import AffinityPropagation
# Step 3: Apply Affinity Propagation
# affinity='precomputed' tells the estimator that the input is already a
# similarity matrix, so the cosine similarities are passed in directly.
affinity_propagation = AffinityPropagation(affinity='precomputed', random_state=42)
labels = affinity_propagation.fit_predict(cosine_sim_matrix)

# Step 4: Create a DataFrame to view results
# 'Skillset' holds the combined text and 'Cluster' the label assigned to that row.
df = pd.DataFrame({'Skillset': train_data, 'Cluster': labels})

# Print the DataFrame with clusters
print("Clustered Skillsets with Affinity Propagation:")
print(df)

# Print the number of items in each cluster
cluster_counts = df['Cluster'].value_counts().sort_index()
print("\nNumber of Items in Each Cluster:")
print(cluster_counts)

In [None]:
#Printing the number of data in each cluster
# NOTE(review): this repeats the cluster-count summary already printed at the
# end of the previous cell.
cluster_counts = df['Cluster'].value_counts().sort_index()
print("\nNumber of Items in Each Cluster:")
print(cluster_counts)

## Dimensionality reduction (t-SNE) and visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Project the (densified) TF-IDF vectors down to two dimensions.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_matrix = tsne.fit_transform(tfidf_matrix.toarray())

# Pair every 2-D point with the cluster label of its row.
df_tsne = pd.DataFrame(reduced_matrix, columns=['TSNE1', 'TSNE2']).assign(Cluster=labels)

# Draw the scatter plot on an explicit figure/axes pair, then save and show it.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df_tsne,
    x='TSNE1',
    y='TSNE2',
    hue='Cluster',
    palette='viridis',
    marker='o',
    s=100,
    ax=ax,
)
ax.set_title('Clusters Visualized with t-SNE')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.legend(title='Cluster')
fig.savefig('tf_idf_cluster.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# In a notebook the inline backend closes a figure once the cell that drew it
# has finished, so by the time this cell runs there is normally no open figure.
# Calling plt.tight_layout()/plt.savefig() unconditionally would then create a
# brand-new empty figure and overwrite the plot already saved by the previous
# cell with a blank image. Only re-save when a figure is actually still open.
if plt.get_fignums():
    plt.tight_layout()  # Automatically adjusts subplot parameters to give specified padding

    # Save the plot with 600 DPI resolution
    plt.savefig('tf_idf_cluster.png', dpi=600, bbox_inches='tight')

## Checking specific cluster values and their data manually

In [None]:
cluster_df = df[df['Cluster'] == 30]

In [None]:
cluster_df

## Intra cluster similarities 

In [None]:
# Mean pairwise cosine similarity inside each cluster, one entry per cluster
# label (NaN for clusters that contain a single point).
intra_cluster_similarities = []

# Get unique cluster labels
unique_clusters = np.unique(labels)

for cluster in unique_clusters:
    # Get the indices of the data points in the current cluster
    cluster_indices = np.where(labels == cluster)[0]
    num_points = len(cluster_indices)

    if num_points > 1:  # Only compute if more than one point
        # Extract the TF-IDF rows for the current cluster and compare them pairwise
        cluster_tfidf_matrix = tfidf_matrix[cluster_indices]
        cluster_sim_matrix = cosine_similarity(cluster_tfidf_matrix)

        # Average over the off-diagonal entries only. Subtract the actual
        # diagonal (trace) rather than assuming every self-similarity is exactly
        # 1: a row whose TF-IDF vector is all zeros has self-similarity 0, and
        # subtracting num_points would then understate the cluster's average.
        off_diagonal_sum = np.sum(cluster_sim_matrix) - np.trace(cluster_sim_matrix)
        avg_similarity = off_diagonal_sum / (num_points * (num_points - 1))

        intra_cluster_similarities.append(avg_similarity)
    else:
        # Not enough points to calculate
        intra_cluster_similarities.append(np.nan)

# Calculate the overall intra-cluster similarity
# Filter out NaN values (clusters with only one point)
valid_similarities = [sim for sim in intra_cluster_similarities if not np.isnan(sim)]
overall_intra_cluster_similarity = np.mean(valid_similarities) if valid_similarities else np.nan

# Print the overall intra-cluster similarity
print(f"\nOverall Intra-Cluster Similarity: {overall_intra_cluster_similarity:.4f}")

## Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the cluster labels, evaluated on the TF-IDF feature matrix.
# NOTE(review): no `metric` argument is passed, so the library default distance
# is used, whereas the clustering itself was driven by cosine similarity.
# Confirm this mismatch is intended.
silhouette_avg = silhouette_score(tfidf_matrix, labels)

# Print the Silhouette Score
print(f"Silhouette Score: {silhouette_avg:.4f}")


In [None]:
from sklearn.metrics import davies_bouldin_score

# Convert sparse matrix to dense
# (the score below is given a dense array; the same conversion is also done
# separately for t-SNE earlier in the notebook)
tfidf_dense = tfidf_matrix.toarray()

# Compute the Davies-Bouldin Index
# for the same cluster labels that were scored with the silhouette metric
dbi = davies_bouldin_score(tfidf_dense, labels)

# Print the Davies-Bouldin Index
print(f"Davies-Bouldin Index: {dbi:.4f}")


In [None]:
data['Combined'] = df['Skillset']

In [None]:
data['Cluster']=df['Cluster']

In [None]:
data

## Recommendation function 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def recommend_profiles_by_topic_skills(user_index, df, top_n, tfidf_matrix=None):
    """
    Recommends the top N profiles to a target profile based on the 'Combined'
    (Skills + Topic) text column.

    Parameters:
    - user_index (int): The positional index of the target profile in the DataFrame.
    - df (pandas.DataFrame): The DataFrame containing profile data; must have a
      'Combined' column.
    - top_n (int): The number of top profiles to recommend.
    - tfidf_matrix (optional): A TF-IDF matrix already fitted on df['Combined'],
      one row per profile. When omitted, the matrix is fitted on every call;
      pass it in when calling the function repeatedly for the same DataFrame.

    Returns:
    - list of tuples: Each tuple contains the index and similarity score of a
      recommended profile, most similar first. The target profile is never
      included, and the list is empty when top_n is not positive.
    """
    # A slice of the form [-0:] selects the whole array, so top_n == 0 used to
    # return every profile instead of none.
    if top_n <= 0:
        return []

    # Vectorise the DataFrame that was passed in (previously the global `data`
    # was read here and the `df` argument was silently ignored).
    if tfidf_matrix is None:
        tfidf_matrix = TfidfVectorizer().fit_transform(df['Combined'])

    n_profiles = tfidf_matrix.shape[0]
    # Resolve a negative index to its position so it can be filtered out below.
    target_position = user_index + n_profiles if user_index < 0 else user_index

    # Compute cosine similarities between the target profile and all profiles
    target_vector = tfidf_matrix[user_index]
    similarities = cosine_similarity(target_vector, tfidf_matrix).flatten()

    # Push the target profile to the bottom of the ranking
    similarities[target_position] = -1

    # Rank from most to least similar, then drop the target explicitly so it
    # cannot be returned even when top_n >= number of profiles.
    ranked = similarities.argsort()[::-1]
    top_indices = ranked[ranked != target_position][:top_n]

    # Create a list of tuples with index and similarity score
    return [(idx, similarities[idx]) for idx in top_indices]

def print_profile_details(profile_index, df):
    """
    Writes a short, human-readable summary of one profile to stdout.

    Parameters:
    - profile_index (int): Positional index of the profile row in the DataFrame.
    - df (pandas.DataFrame): The DataFrame containing profile data.
    """
    # (label shown to the reader, column it is read from)
    labelled_columns = (
        ("Name", "Name"),
        ("Email", "Email"),
        ("Skills", "Skills"),
        ("Domain Interest", "Topic"),
        ("Cluster", "Cluster"),
    )

    row = df.iloc[profile_index]
    print(f"Details of Target Profile (Index: {profile_index}):")
    for label, column in labelled_columns:
        print(f"{label}: {row[column]}")
    print("-" * 40)


def print_recommendation_details(recommendations, df):
    """
    Prints the details of the recommended profiles.

    Parameters:
    - recommendations (list of tuples): Each tuple contains the positional index
      and similarity score of a recommended profile.
    - df (pandas.DataFrame): The DataFrame containing profile data; rows are
      looked up in this frame by position.
    """
    for idx, score in recommendations:
        # Look the row up in the DataFrame that was passed in (previously the
        # global `data` was read and the `df` argument was ignored).
        profile_details = df.iloc[idx]
        print(f"Index: {idx}")
        print(f"Similarity Score: {score:.4f}")
        print(f"Name: {profile_details['Name']}")
        print(f"Email: {profile_details['Email']}")
        print(f"Skills: {profile_details['Skills']}")
        print(f"Domain Interest: {profile_details['Topic']}")
        print(f"Cluster: {profile_details['Cluster']}")
        print("-" * 40)
        
# Demo: print one target profile followed by its ten closest matches.
# NOTE(review): the index is hard-coded; df.iloc raises IndexError when `data`
# has fewer than 576 rows.
target_profile_index = 575  # Replace with the index of the target profile
print_profile_details(target_profile_index, data)
top_n_recommendations = recommend_profiles_by_topic_skills(target_profile_index, data, top_n=10)
print_recommendation_details(top_n_recommendations, data)


In [None]:
def calculate_precision_for_all_profiles(df, top_n=10, similarity_threshold=0.90):
    """
    Averages, over every profile, the share of its top-N recommendations whose
    similarity score reaches a threshold.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing profile data.
    - top_n (int): How many recommendations to request per profile.
    - similarity_threshold (float): Minimum score for a recommendation to count
      as relevant.

    Returns:
    - float: Mean per-profile precision; 0 when there are no profiles.
    """
    profile_total = len(df)
    precision_sum = 0

    for position in range(profile_total):
        # Ask the recommender for this profile's top-N neighbours
        neighbours = recommend_profiles_by_topic_skills(position, df, top_n)

        # Number of neighbours scoring at or above the threshold
        hits = sum(score >= similarity_threshold for _, score in neighbours)

        # Precision is hits / top_n; a non-positive top_n contributes nothing
        if top_n > 0:
            precision_sum += hits / top_n

    if profile_total > 0:
        return precision_sum / profile_total
    return 0

# Example Usage
# The recommender is invoked once per row of `data`, so the cost of this cell
# grows with the number of profiles.
final_precision = calculate_precision_for_all_profiles(data)
print(f"Final Average Precision across all profiles: {final_precision:.4f}")


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def generate_keyword_cloud(df, column, title):
    """
    Renders a word cloud built from the text of one DataFrame column.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing profile data.
    - column (str): Name of the column whose values feed the word cloud.
    - title (str): Title shown above the plot.
    """
    # Non-null values of the column, as strings, joined into one document
    entries = df[column].dropna().astype(str)
    corpus = " ".join(entries.values)

    # Build the cloud image from that document
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)

    # Show it on a dedicated figure with the axes hidden
    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=20)
    ax.axis('off')
    plt.show()

# Generate and plot the keyword cloud for 'Skills'
generate_keyword_cloud(data, 'Skills', 'Keyword Cloud for Skills')