In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4


In [None]:
import pandas as pd
import spacy
from textblob import TextBlob

# Load the small English spaCy pipeline. extract_features below reads
# per-token attributes from the Doc this pipeline returns.
# NOTE(review): no named-entity output is used anywhere in this cell.
nlp = spacy.load('en_core_web_sm')

# Function to extract additional features
def extract_features(sentence):
    """Count the nouns, verbs, adjectives and adverbs in one sentence.

    The sentence is run through the module-level spaCy pipeline ``nlp`` and
    every token whose coarse part-of-speech tag (``token.pos_``) is NOUN,
    VERB, ADJ or ADV is tallied.

    Args:
        sentence: Text to analyse; passed to ``nlp`` unchanged.

    Returns:
        dict: The keys ``num_nouns``, ``num_verbs``, ``num_adjectives`` and
        ``num_adverbs``, each mapped to an int count (0 when the tag never
        occurs in the sentence).
    """
    doc = nlp(sentence)

    # Seed every tracked tag with zero so all four keys are always present,
    # even for sentences that lack one of the categories.
    pos_counts = {pos: 0 for pos in ('NOUN', 'VERB', 'ADJ', 'ADV')}
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1

    return {
        'num_nouns': pos_counts['NOUN'],
        'num_verbs': pos_counts['VERB'],
        'num_adjectives': pos_counts['ADJ'],
        'num_adverbs': pos_counts['ADV'],
    }

# Load your dataset
file_path = 'paraphrases_output.xlsx'
df = pd.read_excel(file_path)

# Compute the POS counts for every sentence. The per-row dicts are turned into
# a frame in one step; expanding them with `.apply(pd.Series)` would build a
# separate Series object for every row, which is far slower on large sheets.
# Reusing df.index keeps the new columns aligned with the original rows.
df_features = pd.DataFrame(
    df['Sentence'].apply(extract_features).tolist(),
    index=df.index,
)

# Combine the new features with the original dataset
df_combined = pd.concat([df, df_features], axis=1)

# Save the enriched dataset to a new Excel file
df_combined.to_excel('Enriched_Dataset.xlsx', index=False)

print("Features added successfully and saved to Enriched_Dataset.xlsx")


Features added successfully and saved to Enriched_Dataset.xlsx


In [None]:

import pandas as pd
import textstat
import spacy
from nltk import tokenize

# Load your dataset
# NOTE: 'Enriched_Dataset.xlsx' is written by the POS-feature cell above and is
# written again by the later cell that splits dependency labels into columns,
# so re-running this cell after that one reads the fully enriched sheet.
file_path = 'Enriched_Dataset.xlsx'  # Update the path if necessary
df = pd.read_excel(file_path)

# Load Spacy's English model
# NOTE(review): nothing in this cell references `nlp`; the readability scores
# below come from textstat only.
nlp = spacy.load('en_core_web_sm')

# Define a function to calculate readability scores
def calculate_readability(sentence):
    """Return two readability scores for a sentence.

    Args:
        sentence: Text handed unchanged to the ``textstat`` scoring functions.

    Returns:
        tuple: ``(flesch_kincaid_grade, gunning_fog)``, i.e. the results of
        ``textstat.flesch_kincaid_grade`` and ``textstat.gunning_fog`` for
        ``sentence``, in that order.
    """
    # Only the two scores that callers unpack are computed; no other textstat
    # metric is evaluated per row.
    return (
        textstat.flesch_kincaid_grade(sentence),
        textstat.gunning_fog(sentence),
    )



# Add new columns to the DataFrame.
# Each sentence is scored once and the (grade, fog) pairs are then split into
# two columns. List comprehensions are used instead of unpacking `zip(*...)`
# so a sheet with no rows yields two empty columns rather than a ValueError.
readability_scores = df['Sentence'].apply(calculate_readability).tolist()
df['Flesch_Kincaid_Grade'] = [grade for grade, _ in readability_scores]
df['Gunning_Fog'] = [fog for _, fog in readability_scores]


# Save the enriched dataset to a new Excel file
df.to_excel('Enriched_Dataset_with_Complexity.xlsx', index=False)

print("Features added and dataset saved as 'Enriched_Dataset_with_Complexity.xlsx'.")


Features added and dataset saved as 'Enriched_Dataset_with_Complexity.xlsx'.


In [None]:
import pandas as pd
import spacy

# Load Spacy model
# calculate_dependency_features below reads the dependency label (token.dep_)
# of every token from the Doc this pipeline returns.
nlp = spacy.load('en_core_web_sm')

# Load the enriched dataset
# This is the workbook saved by the readability cell above.
file_path = 'Enriched_Dataset_with_Complexity.xlsx'
df = pd.read_excel(file_path)

# Function to calculate dependency tree features
def calculate_dependency_features(sentence):
    """Summarise the dependency labels spaCy assigns to a sentence.

    Args:
        sentence: Text to parse with the module-level spaCy pipeline ``nlp``.

    Returns:
        tuple: ``(num_dependents, dep_label_counts)`` where
        ``num_dependents`` is the number of tokens whose ``dep_`` label is
        not ``'ROOT'`` and ``dep_label_counts`` is a plain ``dict`` mapping
        each label to how often it occurs, keyed in order of first
        appearance in the sentence.
    """
    doc = nlp(sentence)

    # Single pass over the tokens. Tallying in encounter order keeps the key
    # order identical from run to run (iterating a set of strings does not),
    # which in turn keeps the derived column order stable downstream.
    num_dependents = 0
    dep_label_counts = {}
    for token in doc:
        label = token.dep_
        if label != 'ROOT':
            num_dependents += 1
        dep_label_counts[label] = dep_label_counts.get(label, 0) + 1

    # A plain dict is returned on purpose: its text form is what a later cell
    # parses back with ast.literal_eval after the Excel round trip.
    return num_dependents, dep_label_counts

# Apply the function to the dataset and extract specific features.
# Each sentence is parsed once and the (count, label-dict) pairs are then split
# into two columns. List comprehensions are used instead of unpacking
# `zip(*...)` so a sheet with no rows yields empty columns rather than a
# ValueError.
dependency_results = df['Sentence'].apply(calculate_dependency_features).tolist()
df['Num_Dependents'] = [count for count, _ in dependency_results]
df['Dep_Label_Counts'] = [label_counts for _, label_counts in dependency_results]

# Save the enriched dataset to a new Excel file
df.to_excel('Enriched_Dataset_with_Dependency_Structure.xlsx', index=False)

print("Dependency structure features added successfully!")


Dependency structure features added successfully!


In [None]:
import pandas as pd
import ast

# Read back the workbook that carries the per-sentence dependency label counts
file_path = 'Enriched_Dataset_with_Dependency_Structure.xlsx'
df = pd.read_excel(file_path)

# The counts come back from the sheet as text, so parse each cell into a dict
df['Dep_Label_Counts'] = df['Dep_Label_Counts'].map(ast.literal_eval)

# Expand the dicts into one column per dependency label, use 0 where a label
# is absent from a sentence, and prefix the names so they cannot collide with
# existing columns
dep_label_df = (
    df['Dep_Label_Counts']
    .apply(pd.Series)
    .fillna(0)
    .add_prefix('Dep_')
)

# Swap the dict column for the expanded per-label columns
df = pd.concat(
    [df.drop(columns=['Dep_Label_Counts']), dep_label_df],
    axis=1,
)

# Write the final sheet
df.to_excel('Enriched_Dataset.xlsx', index=False)

print("Dependency labels split into separate columns successfully!")


Dependency labels split into separate columns successfully!


In [None]:

import spacy
from spacy import displacy

# Load the spaCy model for English
nlp = spacy.load('en_core_web_sm')

# Define the sentence
# (a single hard-coded example used only for the visualisation below)
sentence = "Operating systems provide a graphical user interface for ease of use."

# Process the sentence using spaCy
# The resulting Doc is what gets handed to displaCy.
doc = nlp(sentence)

# Render the dependency tree
# style='dep' selects the dependency visualiser and jupyter=True requests
# notebook rendering.
# NOTE(review): 'distance': 90 presumably narrows the spacing between words
# relative to displaCy's default - confirm against the displaCy options docs.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})
