In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [11]:
import pandas as pd
import collections

# File path to the CSV file
file_path = "C:/Users/Suhas sattigeri/Desktop/Mini P/data/dataset2.csv"

# Load the dataset with utf-8 encoding
df = pd.read_csv(file_path, encoding='utf-8')

# Remove null values for the "Text" column
df.dropna(subset=['Text'], inplace=True)

# Convert the column "Text" to string type
df['Text'] = df['Text'].astype(str)

# Convert the column "Language" to string type
df['Language'] = df['Language'].astype(str)

# Define punctuation and vowels
punc = ('.', ',', '!', '?', ';', ':', '-', '(', ')', '[', ']', '{', '}', "'", '"')
vowels = 'AEIOUaeiou'

# Feature engineering
df['word_count'] = df['Text'].apply(lambda x: len(x.split()))
df['character_count'] = df['Text'].apply(lambda x: len(x.replace(" ", "")))
df['word_density'] = df['word_count'] / (df['character_count'] + 1)
df['punc_count'] = df['Text'].apply(lambda x: len([a for a in x if a in punc]))
df['num_vowels'] = df['Text'].apply(lambda x: sum([1 for a in x if a in vowels]))
df['vowel_density'] = df['num_vowels'] / df['word_count']
df['num_exclamation_marks'] = df['Text'].apply(lambda x: x.count('!'))
df['num_question_marks'] = df['Text'].apply(lambda x: x.count('?'))
df['num_punctuation'] = df['Text'].apply(lambda x: sum(x.count(w) for w in punc))
df['num_unique_words'] = df['Text'].apply(lambda x: len(set(w for w in x.split())))
df['num_repeated_words'] = df['Text'].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))
df['words_vs_unique'] = df['num_unique_words'] / df['word_count']

# Display the first few rows of the dataframe to check the new features
print(df.head())


                   Text Language  word_count  character_count  word_density  \
0             தமிழ்நாடு    Tamil           1                9      0.100000   
1  செய்தி தமிழ் இது ஒரு    Tamil           4               17      0.222222   
2                நன்றி!    Tamil           1                6      0.142857   
3              வணக்கம்!    Tamil           1                8      0.111111   
4           மொழி தமிழ்?    Tamil           2               10      0.181818   

   punc_count  num_vowels  vowel_density  num_exclamation_marks  \
0           0           0            0.0                      0   
1           0           0            0.0                      0   
2           1           0            0.0                      1   
3           1           0            0.0                      1   
4           1           0            0.0                      0   

   num_question_marks  num_punctuation  num_unique_words  num_repeated_words  \
0                   0                0    

In [12]:
import numpy as np
# Ensure all columns used for mean calculation are numerical
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Group by 'language' and calculate the mean, then transpose the result
mean_by_language = df.groupby('Language')[numeric_columns].mean().T

# Display the transposed result
print(mean_by_language)


# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Calculate the Pearson correlation matrix for numeric columns
correlation_matrix = numeric_df.corr(method='pearson')

# Display the correlation matrix
print(correlation_matrix)

Language                 Kannada      Tamil     Telugu
word_count              1.895091   2.042764   2.072079
character_count        10.073040  12.129875  11.034653
word_density            0.173298   0.160330   0.176511
punc_count              0.397862   0.394179   0.395644
num_vowels              0.000000   0.000000   0.000000
vowel_density           0.000000   0.000000   0.000000
num_exclamation_marks   0.200911   0.192239   0.204356
num_question_marks      0.196952   0.201940   0.191287
num_punctuation         0.397862   0.394179   0.395644
num_unique_words        1.895091   2.042764   2.072079
num_repeated_words      0.000000   0.000000   0.000000
words_vs_unique         1.000000   1.000000   1.000000
                       word_count  character_count  word_density  punc_count  \
word_count               1.000000         0.659805      0.468209    0.316843   
character_count          0.659805         1.000000     -0.300911    0.303741   
word_density             0.468209        -0.3

In [13]:
import pandas as pd
# Split the dataset into features and target variable
# Load the dataset with utf-8 encoding
df = pd.read_csv(file_path, encoding='utf-8')

# Remove null values for the "text" column
df.dropna(subset=['Text'], inplace=True)

# Convert the column "text" to string type
df['text'] = df['Text'].astype(str)

# Convert the column "language" to string type
df['Language'] = df['Language'].astype(str)

# Remove rows where the text is empty or only whitespace
df = df[df['Text'].str.strip() != '']

# Features and labels
X = df['Text']
y = df['Language']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [14]:
# Create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))



Accuracy: 0.7733421313097988
Classification Report:
              precision    recall  f1-score   support

     Kannada       1.00      0.86      0.92      1039
       Tamil       0.58      1.00      0.73       944
      Telugu       1.00      0.49      0.65      1048

    accuracy                           0.77      3031
   macro avg       0.86      0.78      0.77      3031
weighted avg       0.87      0.77      0.77      3031



In [15]:
# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classifier
Accuracy: 0.7733421313097988
              precision    recall  f1-score   support

     Kannada       1.00      0.86      0.92      1039
       Tamil       0.58      1.00      0.73       944
      Telugu       1.00      0.49      0.65      1048

    accuracy                           0.77      3031
   macro avg       0.86      0.78      0.77      3031
weighted avg       0.87      0.77      0.77      3031



In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train a Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test_tfidf)

# Evaluate the model
print("\nDecision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))



Decision Tree Classifier


Accuracy: 0.9313757835697789
Classification Report:
              precision    recall  f1-score   support

     Kannada       1.00      0.86      0.92      1039
       Tamil       1.00      0.94      0.97       944
      Telugu       0.83      1.00      0.91      1048

    accuracy                           0.93      3031
   macro avg       0.94      0.93      0.93      3031
weighted avg       0.94      0.93      0.93      3031



In [19]:
# Example new text data
new_texts = ["ಕನ್ನಡವೇನು", "தமிழ்நாடு", "తెలుగు భాష "]  # Ensure texts are representative

# Convert the new text data to a DataFrame
new_df = pd.DataFrame(new_texts, columns=['Text'])

# Preprocess the new text data (consistent with training preprocessing)
new_df['Text'] = new_df['Text'].str.lower().str.replace("[^\w\s]", "", regex=True)
new_df = new_df[new_df['Text'].str.strip() != '']  # Keep non-empty texts only

# Transform the new text data using the same TF-IDF vectorizer
new_texts_tfidf = vectorizer.transform(new_df['Text'])

# Predict using Naive Bayes model
nb_predictions = nb_model.predict(new_texts_tfidf)

# Predict using Decision Tree model
dt_predictions = dt_model.predict(new_texts_tfidf)

# Print the predictions
for i, text in enumerate(new_texts):
    print(f"Text: {text.strip()}")
    print(f"Naive Bayes Prediction: {nb_predictions[i]}")
    print(f"Decision Tree Prediction: {dt_predictions[i]}")
    print()


Text: ಕನ್ನಡವೇನು
Naive Bayes Prediction: Tamil
Decision Tree Prediction: Telugu

Text: தமிழ்நாடு
Naive Bayes Prediction: Tamil
Decision Tree Prediction: Telugu

Text: తెలుగు భాష
Naive Bayes Prediction: Tamil
Decision Tree Prediction: Telugu



In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import collections
import numpy as np

# Load the dataset with utf-8 encoding
file_path = 'C:/Users/Suhas sattigeri/Desktop/Mini P/data/dataset2.csv'
df = pd.read_csv(file_path, encoding='utf-8')

# Remove null values for the "Text" column
df.dropna(subset=['Text'], inplace=True)

# Convert the column "Text" to string type
df['Text'] = df['Text'].astype(str)

# Convert the column "Language" to string type
df['Language'] = df['Language'].astype(str)

# Preprocess the text (lowercase and remove punctuation)
df['Text'] = df['Text'].str.lower().str.replace("[^\w\s]", "", regex=True)

# Remove rows with empty text after preprocessing
df = df[df['Text'].str.strip() != '']

# Feature engineering
df['word_count'] = df['Text'].apply(lambda x: len(x.split()))
df['character_count'] = df['Text'].apply(lambda x: len(x.replace(" ", "")))
df['word_density'] = df['word_count'] / (df['character_count'] + 1)
df['punc_count'] = df['Text'].apply(lambda x: len([a for a in x if a in '.,!?;:()[]{}\'"']))
df['num_vowels'] = df['Text'].apply(lambda x: sum([1 for a in x if a in 'aeiou']))
df['vowel_density'] = df['num_vowels'] / df['word_count']
df['num_unique_words'] = df['Text'].apply(lambda x: len(set(x.split())))

# Split the dataset into training and test sets
X = df['Text']
y = df['Language']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Train a Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test_tfidf)
y_pred_dt = dt_model.predict(X_test_tfidf)

# Evaluate the models
print("Naive Bayes Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

print("\nDecision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# Example new text data
new_texts = ["ಕನ್ನಡವೇನು", "தமிழ்நாடு", "తెలుగు భాష "]  # Ensure texts are representative

# Convert the new text data to a DataFrame
new_df = pd.DataFrame(new_texts, columns=['Text'])

# Preprocess the new text data (consistent with training preprocessing)
new_df['Text'] = new_df['Text'].str.lower().str.replace("[^\w\s]", "", regex=True)
new_df = new_df[new_df['Text'].str.strip() != '']  # Keep non-empty texts only

# Transform the new text data using the same TF-IDF vectorizer
new_texts_tfidf = vectorizer.transform(new_df['Text'])

# Predict using Naive Bayes model
nb_predictions = nb_model.predict(new_texts_tfidf)

# Predict using Decision Tree model
dt_predictions = dt_model.predict(new_texts_tfidf)

# Print the predictions
for i, text in enumerate(new_texts):
    print(f"Text: {text.strip()}")
    print(f"Naive Bayes Prediction: {nb_predictions[i]}")
    print(f"Decision Tree Prediction: {dt_predictions[i]}")
    print()


Naive Bayes Classifier
Accuracy: 1.0
              precision    recall  f1-score   support

     Kannada       1.00      1.00      1.00      1039
       Tamil       1.00      1.00      1.00       944
      Telugu       1.00      1.00      1.00      1048

    accuracy                           1.00      3031
   macro avg       1.00      1.00      1.00      3031
weighted avg       1.00      1.00      1.00      3031


Decision Tree Classifier
Accuracy: 1.0
              precision    recall  f1-score   support

     Kannada       1.00      1.00      1.00      1039
       Tamil       1.00      1.00      1.00       944
      Telugu       1.00      1.00      1.00      1048

    accuracy                           1.00      3031
   macro avg       1.00      1.00      1.00      3031
weighted avg       1.00      1.00      1.00      3031

Text: ಕನ್ನಡವೇನು
Naive Bayes Prediction: Kannada
Decision Tree Prediction: Kannada

Text: தமிழ்நாடு
Naive Bayes Prediction: Tamil
Decision Tree Prediction: Tamil


In [14]:
for i, text in enumerate(new_texts):
    print(f"Text: {text.strip()}")
    print(f"Naive Bayes Prediction: {nb_predictions[i]}")
    print(f"Decision Tree Prediction: {dt_predictions[i]}")
    print()

Text: ಕನ್ನಡವೇನು
Naive Bayes Prediction: Kannada
Decision Tree Prediction: Kannada

Text: தமிழ்நாடு
Naive Bayes Prediction: Tamil
Decision Tree Prediction: Tamil

Text: తెలుగు భాష
Naive Bayes Prediction: Telugu
Decision Tree Prediction: Telugu

