In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [18]:
import pandas as pd
import collections

# File path to the CSV file
file_path = "C:/Users/Suhas sattigeri/Desktop/Mini P/data/dataset1.csv"

# Load the dataset with utf-8 encoding
df = pd.read_csv(file_path, encoding='utf-8')

# Remove null values for the "Text" column
df.dropna(subset=['Text'], inplace=True)

# Convert the column "Text" to string type
df['Text'] = df['Text'].astype(str)

# Convert the column "Language" to string type
df['Language'] = df['Language'].astype(str)

# Define punctuation and vowels
punc = ('.', ',', '!', '?', ';', ':', '-', '(', ')', '[', ']', '{', '}', "'", '"')
vowels = 'AEIOUaeiou'

# Feature engineering
df['word_count'] = df['Text'].apply(lambda x: len(x.split()))
df['character_count'] = df['Text'].apply(lambda x: len(x.replace(" ", "")))
df['word_density'] = df['word_count'] / (df['character_count'] + 1)
df['punc_count'] = df['Text'].apply(lambda x: len([a for a in x if a in punc]))
df['num_vowels'] = df['Text'].apply(lambda x: sum([1 for a in x if a in vowels]))
df['vowel_density'] = df['num_vowels'] / df['word_count']
df['num_exclamation_marks'] = df['Text'].apply(lambda x: x.count('!'))
df['num_question_marks'] = df['Text'].apply(lambda x: x.count('?'))
df['num_punctuation'] = df['Text'].apply(lambda x: sum(x.count(w) for w in punc))
df['num_unique_words'] = df['Text'].apply(lambda x: len(set(w for w in x.split())))
df['num_repeated_words'] = df['Text'].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))
df['words_vs_unique'] = df['num_unique_words'] / df['word_count']

# Display the first few rows of the dataframe to check the new features
print(df.head())


                   Text Language  word_count  character_count  word_density  \
0             தமிழ்நாடு    Tamil           1                9      0.100000   
1  செய்தி தமிழ் இது ஒரு    Tamil           4               17      0.222222   
2                நன்றி!    Tamil           1                6      0.142857   
3              வணக்கம்!    Tamil           1                8      0.111111   
4           மொழி தமிழ்?    Tamil           2               10      0.181818   

   punc_count  num_vowels  vowel_density  num_exclamation_marks  \
0           0           0            0.0                      0   
1           0           0            0.0                      0   
2           1           0            0.0                      1   
3           1           0            0.0                      1   
4           1           0            0.0                      0   

   num_question_marks  num_punctuation  num_unique_words  num_repeated_words  \
0                   0                0    

In [19]:
import numpy as np
# Ensure all columns used for mean calculation are numerical
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Group by 'language' and calculate the mean, then transpose the result
mean_by_language = df.groupby('Language')[numeric_columns].mean().T

# Display the transposed result
print(mean_by_language)


# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Calculate the Pearson correlation matrix for numeric columns
correlation_matrix = numeric_df.corr(method='pearson')

# Display the correlation matrix
print(correlation_matrix)

Language                 Kannada      Tamil     Telugu
word_count              2.239515   2.239594   2.280000
character_count        11.596986  13.651262  11.455082
word_density            0.183448   0.158395   0.191060
punc_count              0.658585   0.652573   0.655082
num_vowels              0.000000   0.000000   0.000000
vowel_density           0.000000   0.000000   0.000000
num_exclamation_marks   0.332569   0.318256   0.338361
num_question_marks      0.326016   0.334317   0.316721
num_punctuation         0.658585   0.652573   0.655082
num_unique_words        2.239515   2.239594   2.280000
num_repeated_words      0.000000   0.000000   0.000000
words_vs_unique         1.000000   1.000000   1.000000
                       word_count  character_count  word_density  punc_count  \
word_count               1.000000         0.570438      0.341059    0.005647   
character_count          0.570438         1.000000     -0.514231    0.099512   
word_density             0.341059        -0.5

In [20]:
import pandas as pd
# Split the dataset into features and target variable
# Load the dataset with utf-8 encoding
df = pd.read_csv(file_path, encoding='utf-8')

# Remove null values for the "text" column
df.dropna(subset=['Text'], inplace=True)

# Convert the column "text" to string type
df['text'] = df['Text'].astype(str)

# Convert the column "language" to string type
df['Language'] = df['Language'].astype(str)

# Remove rows where the text is empty or only whitespace
df = df[df['Text'].str.strip() != '']

# Features and labels
X = df['Text']
y = df['Language']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [21]:
# Create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))



Accuracy: 0.8716548334243582
Classification Report:
              precision    recall  f1-score   support

     Kannada       1.00      0.72      0.83       653
       Tamil       1.00      0.92      0.96       605
      Telugu       0.71      1.00      0.83       573

    accuracy                           0.87      1831
   macro avg       0.90      0.88      0.87      1831
weighted avg       0.91      0.87      0.87      1831



In [22]:
# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classifier
Accuracy: 0.8716548334243582
              precision    recall  f1-score   support

     Kannada       1.00      0.72      0.83       653
       Tamil       1.00      0.92      0.96       605
      Telugu       0.71      1.00      0.83       573

    accuracy                           0.87      1831
   macro avg       0.90      0.88      0.87      1831
weighted avg       0.91      0.87      0.87      1831



In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train a Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test_tfidf)

# Evaluate the model
print("\nDecision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))



Decision Tree Classifier
Accuracy: 0.8716548334243582
Classification Report:
              precision    recall  f1-score   support

     Kannada       1.00      0.72      0.83       653
       Tamil       1.00      0.92      0.96       605
      Telugu       0.71      1.00      0.83       573

    accuracy                           0.87      1831
   macro avg       0.90      0.88      0.87      1831
weighted avg       0.91      0.87      0.87      1831



In [24]:
# classification.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset
df = pd.read_csv("C:/Users/Suhas sattigeri/Desktop/Mini P/data/dataset2.csv")

# Preprocess the data
df["Text"] = df["Text"].str.lower().str.replace("[^\w\s]", "", regex=True)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Split the dataset into training and test sets
X = df["Text"]
y = df["Language"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer only on the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data using the already fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Train Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Evaluate models
def evaluate_model(model, X_test_tfidf, y_test):
    predictions = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average="weighted")
    recall = recall_score(y_test, predictions, average="weighted")
    f1 = f1_score(y_test, predictions, average="weighted")
    report = classification_report(y_test, predictions)
    return accuracy, precision, recall, f1, report

nb_accuracy, nb_precision, nb_recall, nb_f1, nb_report = evaluate_model(nb_model, X_test_tfidf, y_test)
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_report = evaluate_model(dt_model, X_test_tfidf, y_test)

# Export models and vectorizer for use in Flask app
def classify_text(text):
    text_tfidf = vectorizer.transform([text])
    nb_prediction = nb_model.predict(text_tfidf)[0]
    dt_prediction = dt_model.predict(text_tfidf)[0]
    
    nb_confidence = nb_model.predict_proba(text_tfidf)[0][
        nb_model.classes_.tolist().index(nb_prediction)
    ]
    
    dt_confidence = None
    if hasattr(dt_model, 'predict_proba'):
        dt_confidence = dt_model.predict_proba(text_tfidf)[0][
            dt_model.classes_.tolist().index(dt_prediction)
        ]
    return {
        "nb_prediction": nb_prediction,
        "dt_prediction": dt_prediction,
        "nb_confidence": nb_confidence,
        "dt_confidence": dt_confidence
    }



In [25]:
from flask import Flask, request, jsonify, render_template
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import logging
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Ensure the results are consistent by setting seed
DetectorFactory.seed = 0

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO)

# Load the dataset
df = pd.read_csv("C:/Users/Suhas sattigeri/Desktop/Mini P/data/cleaned_dataset1.csv")
df["Text"] = df["Text"].str.lower().str.replace("[^\w\s]", "", regex=True)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Split the dataset into training and test sets
X = df["Text"]
y = df["Language"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer only on the training data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train models
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

@app.route("/")
def main():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    if request.method == "POST":
        text = request.get_json().get("text", "")
        if not text:
            return jsonify({"error": "Missing or empty text field"}), 400
        try:
            language = detect(text)
            text_tfidf = vectorizer.transform([text])
            nb_prediction = nb_model.predict(text_tfidf)[0]
            dt_prediction = dt_model.predict(text_tfidf)[0]
            nb_confidence = nb_model.predict_proba(text_tfidf)[0][
                nb_model.classes_.tolist().index(nb_prediction)
            ]
            dt_confidence = dt_model.predict_proba(text_tfidf)[0][
                dt_model.classes_.tolist().index(dt_prediction)
            ]
            return jsonify(
                {
                    "language": language,
                    "nb_prediction": nb_prediction,
                    "dt_prediction": dt_prediction,
                    "nb_confidence": nb_confidence * 100,
                    "dt_confidence": dt_confidence * 100 if dt_confidence is not None else None,
                }
            )
        except LangDetectException as e:
            logging.error(f"Language detection failed: {e}")
            return jsonify({"error": "Could not detect language"}), 500

@app.route("/translate", methods=["POST"])
def translate():
    data = request.get_json()
    text = data.get("text", "")
    target_language = data.get("target_language", "")

    # Replace this with actual translation logic
    translated_text = f"[Translated {text} to {target_language}]"  # Placeholder translation logic

    return jsonify({"translated_text": translated_text})

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [27]:
from flask import Flask, request, jsonify, render_template
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import logging
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Ensure consistent results by setting seed
DetectorFactory.seed = 0

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO)

# Load and preprocess the dataset
df = pd.read_csv("C:/Users/Suhas sattigeri/Desktop/Mini P/data/dataset2.csv")
df["Text"] = df["Text"].str.lower().str.replace("[^\w\s]", "", regex=True)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Split the dataset into training and test sets
X = df["Text"]
y = df["Language"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer on the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Train a Decision Tree classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Evaluate the models
nb_predictions = nb_model.predict(X_test_tfidf)
dt_predictions = dt_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average="weighted")
nb_recall = recall_score(y_test, nb_predictions, average="weighted")
nb_f1 = f1_score(y_test, nb_predictions, average="weighted")

dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions, average="weighted")
dt_recall = recall_score(y_test, dt_predictions, average="weighted")
dt_f1 = f1_score(y_test, dt_predictions, average="weighted")

# Flask routes
@app.route("/")
def main():
    return render_template("main.html")

@app.route("/classify")
def home():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    if request.method == "POST":
        text = request.get_json().get("text", "")
        if not text:
            return jsonify({"error": "Missing or empty text field"}), 400
        try:
            language = detect(text)
            text_tfidf = vectorizer.transform([text])
            nb_prediction = nb_model.predict(text_tfidf)[0]
            dt_prediction = dt_model.predict(text_tfidf)[0]
            nb_confidence = nb_model.predict_proba(text_tfidf)[0][
                nb_model.classes_.tolist().index(nb_prediction)
            ]
            dt_confidence = None
            if hasattr(dt_model, 'predict_proba'):
                dt_confidence = dt_model.predict_proba(text_tfidf)[0][
                    dt_model.classes_.tolist().index(dt_prediction)
                ]
            return jsonify(
                {
                    "language": language,
                    "nb_prediction": nb_prediction,
                    "dt_prediction": dt_prediction,
                    "nb_confidence": nb_confidence * 100,
                    "dt_confidence": dt_confidence * 100 if dt_confidence is not None else None,
                }
            )
        except LangDetectException as e:
            logging.error(f"Language detection failed: {e}")
            return jsonify({"language": "Could not detect language"}), 500

@app.route("/results")
def results():
    return render_template(
        "results.html",
        nb_accuracy=nb_accuracy,
        nb_precision=nb_precision,
        nb_recall=nb_recall,
        nb_f1=nb_f1,
        dt_accuracy=dt_accuracy,
        dt_precision=dt_precision,
        dt_recall=dt_recall,
        dt_f1=dt_f1,
    )

if __name__ == "__main__":
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
