<a href="https://colab.research.google.com/github/shadab007-byte/House-price-prediction/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Required Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Custom Transformer for text preprocessing
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self.clean_text)

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
        return ' '.join(tokens)

# Load the dataset
df = pd.read_csv('large_sample_tweets.csv')  # Update path as needed
X = df['tweet']
y = df['label']  # Update label column as needed

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline with TF-IDF and Random Forest Classifier
pipeline = Pipeline([
    ('cleaner', TextCleaner()),
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), sublinear_tf=True)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# GridSearch with Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best model from GridSearch
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Cross-validation accuracy
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00       102

    accuracy                           1.00       204
   macro avg       1.00      1.00      1.00       204
weighted avg       1.00      1.00      1.00       204

Accuracy: 1.0000
Cross-Validation Accuracy: 1.0000


In [3]:
# Required Libraries
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the dataset
df = pd.read_csv('large_sample_tweets.csv')  # Using the specified file

# Preprocessing function to clean tweets
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'tweet' column
df['clean_text'] = df['tweet'].apply(clean_text)

# Defining features (X) and labels (y)
X = df['clean_text']
y = df['label']  # Assuming the label column is named 'label'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using TF-IDF vectorization for feature extraction
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Using bigrams for more context
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest Classifier with GridSearch for hyperparameter tuning
rf = RandomForestClassifier()

# Set up the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearch to find the best parameters
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Best Random Forest model
best_rf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf.predict(X_test_tfidf)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Check the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Cross-validation accuracy
cv_scores = cross_val_score(best_rf, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f}")

# Evaluation with examples
import numpy as np
predictions = best_rf.predict(X_test_tfidf)

# Convert test set to DataFrame for better display
test_results = pd.DataFrame({
    'Tweet': X_test,
    'True Label': y_test,
    'Predicted Label': predictions
})

# Show examples of correctly classified hate speech
print("Correctly Classified Hate Speech Examples:")
print(test_results[(test_results['True Label'] == 1) & (test_results['Predicted Label'] == 1)].head(5))

# Show examples of incorrectly classified hate speech
print("\nIncorrectly Classified Hate Speech Examples:")
print(test_results[(test_results['True Label'] == 1) & (test_results['Predicted Label'] == 0)].head(5))

# Show examples of correctly classified non-hate speech
print("\nCorrectly Classified Non-Hate Speech Examples:")
print(test_results[(test_results['True Label'] == 0) & (test_results['Predicted Label'] == 0)].head(5))

# Show examples of incorrectly classified non-hate speech
print("\nIncorrectly Classified Non-Hate Speech Examples:")
print(test_results[(test_results['True Label'] == 0) & (test_results['Predicted Label'] == 1)].head(5))

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00       102

    accuracy                           1.00       204
   macro avg       1.00      1.00      1.00       204
weighted avg       1.00      1.00      1.00       204

Accuracy: 1.0000
Cross-Validation Accuracy: 1.0000
Correctly Classified Hate Speech Examples:
                      Tweet  True Label  Predicted Label
523      wish people better           1                1
526   anger consumes around           1                1
76           focus positive           1                1
70   appreciate good around           1                1
675            despise much           1                1

Incorrectly Classified Hate Speech Examples:
Empty DataFrame
Columns: [Tweet, True Label, Predicted Label]
Index: []

Correctly Classified Non-Hate Speech Examples:
                                     Tweet  True

In [12]:
# Required Libraries
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the new challenging dataset
df = pd.read_csv('large_sample_tweets.csv')  # Update path as needed

# Preprocessing function to clean tweets
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'tweet' column
df['clean_text'] = df['tweet'].apply(clean_text)

# Defining features (X) and labels (y)
X = df['clean_text']
y = df['label']  # Assuming the label column is named 'label'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using TF-IDF vectorization for feature extraction
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Using bigrams for more context
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest Classifier with GridSearch for hyperparameter tuning
rf = RandomForestClassifier()

# Set up the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearch to find the best parameters
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Best Random Forest model
best_rf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf.predict(X_test_tfidf)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Check the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Cross-validation accuracy
cv_scores = cross_val_score(best_rf, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f}")

# Check correct and incorrect classifications
correct_hate_speech = df[(y_test == 1) & (y_pred == 1)]
incorrect_hate_speech = df[(y_test == 1) & (y_pred == 0)]
correct_non_hate_speech = df[(y_test == 0) & (y_pred == 0)]
incorrect_non_hate_speech = df[(y_test == 0) & (y_pred == 1)]

# Output results
print("\nCorrectly Classified Hate Speech Examples:")
print(correct_hate_speech[['tweet', 'label', 'predicted_label']])
print("\nIncorrectly Classified Hate Speech Examples:")
print(incorrect_hate_speech[['tweet', 'label', 'predicted_label']])

print("\nCorrectly Classified Non-Hate Speech Examples:")
print(correct_non_hate_speech[['tweet', 'label', 'predicted_label']])
print("\nIncorrectly Classified Non-Hate Speech Examples:")
print(incorrect_non_hate_speech[['tweet', 'label', 'predicted_label']])

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00       102

    accuracy                           1.00       204
   macro avg       1.00      1.00      1.00       204
weighted avg       1.00      1.00      1.00       204

Accuracy: 1.0000
Cross-Validation Accuracy: 1.0000


  correct_hate_speech = df[(y_test == 1) & (y_pred == 1)]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

# New Section