In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'final_manglish_transliterated.csv'
df = pd.read_csv(file_path)

# Print relevant properties of the dataset for presentation
print("Dataset Properties:")
print("\n1. Column Names:")
print(df.columns.tolist())

print("\n2. Data Types:")
print(df.dtypes)

print("\n3. Number of Rows and Columns:")
print(df.shape)

print("\n4. Sample Data:")
print(df.head())

# Additional properties for numerical columns (if applicable)
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
if numerical_columns:
    print("\n5. Numerical Data Statistics:")
    print(df.describe())

# Additional properties for categorical columns (if applicable)
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
if categorical_columns:
    for col in categorical_columns:
        print(f"\n6. Unique values in '{col}':")
        print(df[col].unique())

# Additional properties for date/time columns (if applicable)
datetime_columns = df.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_columns:
    print("\n7. Date/Time Data Information:")
    for col in datetime_columns:
        print(f"Column '{col}':")
        print(f"- Minimum Date: {df[col].min()}")
        print(f"- Maximum Date: {df[col].max()}")
        print(f"- Date Range: {df[col].max() - df[col].min()}")

# Additional properties for missing values
missing_values_count = df.isnull().sum()
if not missing_values_count.empty:
    print("\n8. Missing Values Information:")
    print(missing_values_count[missing_values_count > 0])

# Additional properties for unique identifier columns (if applicable)
unique_identifier_columns = ['link']  # Adjust based on your dataset
if unique_identifier_columns:
    print("\n9. Unique Identifier Columns Information:")
    for col in unique_identifier_columns:
        print(f"Column '{col}':")
        print(f"- Unique Values Count: {df[col].nunique()}")

# Add more properties as needed for your specific dataset

# Calculate maximum and minimum length of sentences in 'commentText'
max_sentence_length = df['commentText'].str.len().max()
min_sentence_length = df['commentText'].str.len().min()

print(f"\n10. Maximum Length of Sentence in 'commentText': {max_sentence_length} characters")
print(f"11. Minimum Length of Sentence in 'commentText': {min_sentence_length} characters")


Column Names:
['product_category', 'product_name', 'link', 'commentDate', 'commentText', 'Sentiment_Class', 'transliterated_text']

Data Types:
product_category       object
product_name           object
link                   object
commentDate            object
commentText            object
Sentiment_Class        object
transliterated_text    object
dtype: object

Number of Rows and Columns:
(18794, 7)

Sample Data:
  product_category   product_name  \
0           Mobile  iphone 13 pro   
1           Mobile  iphone 13 pro   
2           Mobile  iphone 13 pro   
3           Mobile  iphone 13 pro   
4           Mobile  iphone 13 pro   

                                          link          commentDate  \
0  https://www.youtube.com/watch?v=NUu8V5z1l9g  2023-01-18 18:46:57   
1  https://www.youtube.com/watch?v=NUu8V5z1l9g  2022-11-04 18:19:46   
2  https://www.youtube.com/watch?v=NUu8V5z1l9g  2022-09-30 22:43:30   
3  https://www.youtube.com/watch?v=NUu8V5z1l9g  2022-08-23 12:30:39   


In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Load the CSV file into a DataFrame
file_path = 'final_manglish_transliterated.csv'
df = pd.read_csv(file_path)

# Number of entries
num_entries = len(df)

# Features and feature names
num_features = len(df.columns)
feature_names = df.columns.tolist()

# Number of classes and their names
num_classes = df['Sentiment_Class'].nunique()
class_names = df['Sentiment_Class'].unique()

# Number of sentences, word tokens, vocabulary size, and average sentence length
num_sentences = len(df['commentText'])
all_tokens = [word_tokenize(sent) for sent in df['commentText']]
num_word_tokens = sum(len(tokens) for tokens in all_tokens)
vocab_size = len(set(token.lower() for tokens in all_tokens for token in tokens))
avg_sentence_length = num_word_tokens / num_sentences

# Print the calculated properties
print(f"Number of Entries: {num_entries}")
print(f"Number of Features: {num_features}")
print(f"Feature Names: {feature_names}")
print(f"Number of Classes: {num_classes}")
print(f"Class Names: {class_names}")
print(f"Number of Sentences: {num_sentences}")
print(f"Number of Word Tokens: {num_word_tokens}")
print(f"Vocabulary Size: {vocab_size}")
print(f"Average Sentence Length: {avg_sentence_length:.2f} words per sentence")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Number of Entries: 18794
Number of Features: 7
Feature Names: ['product_category', 'product_name', 'link', 'commentDate', 'commentText', 'Sentiment_Class', 'transliterated_text']
Number of Classes: 5
Class Names: ['Positive' 'Neutral' 'Not_relevant' 'Negative' 'Mixed Feelings']
Number of Sentences: 18794
Number of Word Tokens: 283016
Vocabulary Size: 40941
Average Sentence Length: 15.06 words per sentence


In [4]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'final_manglish_transliterated.csv'
df = pd.read_csv(file_path)

# Calculate class distribution
class_distribution = df['Sentiment_Class'].value_counts()

# Print the class distribution
print("Class Distribution:")
for sentiment_class, count in class_distribution.items():
    print(f"- {sentiment_class}: {count} instances")


Class Distribution:
- Neutral: 6384 instances
- Not_relevant: 6287 instances
- Positive: 2946 instances
- Negative: 2594 instances
- Mixed Feelings: 583 instances


In [6]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'final_manglish_transliterated.csv'
df = pd.read_csv(file_path)

# Find the longest sentence and its count
longest_sentence = df['commentText'].iloc[df['commentText'].str.len().idxmax()]
longest_sentence_count = df['commentText'].str.count(longest_sentence).sum()
longest_sentence_length = len(longest_sentence.split())

# Print the longest sentence, its count, and length
print(f"Longest Sentence:")
print(longest_sentence)
print(f"\nLength of Longest Sentence: {longest_sentence_length} words")
print(f"Count of Longest Sentence: {longest_sentence_count} occurrences")


Longest Sentence:
ഐഫോൺ 12 /12പ്രൊ മാക്സ് HDR 10 bit Dolby vision വീഡിയോ റെക്കോർഡിങ്നെ കുറിച്ചു നമ്മൾ ചില കാര്യങ്ങൾ മനസിലാക്കേണ്ടത്ഒരു നോർമൽ കസ്റ്റമർ വെച്ച് നോക്കുവാണേൽ HDR Dolby vision വലിയ സംഭവമായി തോന്നില്ലപക്ഷെ വീഡിയോഗ്രാഫി il, ഉള്ളവർക്ക് അലെൽ സിനിമ ഫീൽഡ് ഉള്ളവർക്ക് ഹ്യൂജ് അച്ചീവേമെന്റ് ആണ്കരണo(Worlds Most advanced dynamic HDR ആണ് dolby Vision )Dolby Vision special is its use of dynamic metadata. Unlike HDR10, ആൻഡ് HDR 10 plus which is an open format, Dolby Vision uses dynamic metadata to tone-map the image on a scene-by-scene or frame-by-frame basisഇന്ന് real time 10 bit HDR dolby visionൽ റെക്കോർഡ് ചെയ്യാൻ പറ്റുന്ന ഒരു സിനിമ ക്യാമറ/ പ്രഫഷണൽ ക്യാമറ ഇല്ലഅപ്പോൾ നിങ്ങൾ ഓർക്കും dolby വിഷൻ ഇൽ സിനിമ ഉണ്ടാലോ എന്ന് ശരിയാണ് പക്ഷെ സിനിമ ക്യാമറ പോലും വീഡിയോ എടുത്തു പിന്നിട് എഡിറ്റിംഗ് സോഫ്റ്റ്‌വെയർ ഇൽ ഓരോ ഫ്രെയിം കളർ ഗ്രേഡ്,ചെയ്‌താണ് dolby vision HDR ആകുന്നത്*ഇവിടെയാണ് നമ്മൾ ഐഫോൺ 12 പ്രൊ /മാക്സ്ന്റെവീഡിയോ റെക്കോർഡ് മികവ് മനസിലാക്കേണ്ടത്(Iphone 12 pro/ pro max First camera ever to record HDR 10

In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.utils import resample

# Load data from CSV
file_path = 'final_manglish_transliterated.csv'
data = pd.read_csv(file_path)

# Ensure 'commentText' column contains strings
data['transliterated_text'] = data['transliterated_text'].astype(str)

# Function to convert sentiment labels to numerical values
sentiment_dict = {
    'Positive': 0,
    'Negative': 1,
    'Not_relevant': 2,
    'Mixed Feelings': 3,
    'Neutral': 4
}

data['Sentiment_Class'] = data['Sentiment_Class'].map(sentiment_dict)

# Upsample minority classes to match the size of the majority class
data_upsampled = pd.concat([
    resample(data[data['Sentiment_Class'] == sentiment_dict['Positive']], replace=True, n_samples=len(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']])),
    resample(data[data['Sentiment_Class'] == sentiment_dict['Negative']], replace=True, n_samples=len(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']])),
    resample(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']], replace=True, n_samples=len(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']])),
    resample(data[data['Sentiment_Class'] == sentiment_dict['Mixed Feelings']], replace=True, n_samples=len(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']])),
    resample(data[data['Sentiment_Class'] == sentiment_dict['Neutral']], replace=True, n_samples=len(data[data['Sentiment_Class'] == sentiment_dict['Not_relevant']]))
])

# Print class distribution after oversampling
print(data_upsampled['Sentiment_Class'].value_counts())


Sentiment_Class
0    6287
1    6287
2    6287
3    6287
4    6287
Name: count, dtype: int64
