# Pseudo-Distributed Mode

In [None]:
import os

# Install Java SE 8:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!update-alternatives --config java
!update-alternatives --config javac
!update-alternatives --config jps

# Configure Java Path:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"
os.environ["PATH"] += ":$JAVA_HOME/bin:$JRE_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin"

# Installing Secure Shell Server:
!apt-get purge openssh-server -qq
!apt-get install openssh-server -qq > /dev/null
!service ssh start

# Connect To Localhost:
!ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa
!more /root/.ssh/id_rsa.pub
!cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
!chmod 0600 ~/.ssh/authorized_keys
!ssh -o StrictHostKeyChecking=no localhost uptime

# Install Apache Hadoop:
!wget -q https://archive.apache.org/dist/hadoop/common/hadoop-3.2.3/hadoop-3.2.3.tar.gz
!sudo tar -xzf hadoop-3.2.3.tar.gz
!cp -r hadoop-3.2.3/ /usr/local/

# Configure Hadoop Path:
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.2.3"
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

# Configure Hadoop:
# Every step below edits a file under etc/hadoop of the Hadoop install in place
# with `sed -i`. The XML steps use sed's `a` (append) command to insert
# <property> blocks right after the line containing <configuration>.
# NOTE(review): the multi-line commands appear to rely on the trailing
# backslashes acting as line continuations and on sed turning `\n` into real
# newlines - confirm before reformatting them.
# Hadoop Env:
# Appends an `export JAVA_HOME=` line pointing at the Java 8 JDK after each line
# of hadoop-env.sh that matches "export JAVA_HOME=". Unlike the steps below,
# this one hard-codes the install path instead of using $HADOOP_HOME.
!sed -i '/export JAVA_HOME=/a export JAVA_HOME=\/usr\/lib\/jvm\/java-8-openjdk-amd64' /usr/local/hadoop-3.2.3/etc/hadoop/hadoop-env.sh
# Core Site:
# Sets fs.defaultFS to hdfs://localhost:9000.
!sed -i '/<configuration>/a\
  <property>\n\
    <name>fs.defaultFS</name>\n\
    <value>hdfs://localhost:9000</value>\n\
  </property>' \
$HADOOP_HOME/etc/hadoop/core-site.xml
# HDFS Site:
# Sets dfs.replication to 1 (presumably because a single DataNode runs on this
# machine - TODO confirm).
!sed -i '/<configuration>/a\
  <property>\n\
    <name>dfs.replication</name>\n\
    <value>1</value>\n\
  </property>' \
$HADOOP_HOME/etc/hadoop/hdfs-site.xml
# Mapred Site:
# Sets mapreduce.framework.name to yarn and defines the MapReduce application
# classpath. $HADOOP_MAPRED_HOME sits inside single quotes, so the shell does
# not expand it and the reference is written to the file as-is.
!sed -i '/<configuration>/a\
  <property>\n\
    <name>mapreduce.framework.name</name>\n\
    <value>yarn</value>\n\
  </property>\n\
  <property>\n\
    <name>mapreduce.application.classpath</name>\n\
    <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>\n\
  </property>' \
$HADOOP_HOME/etc/hadoop/mapred-site.xml
# Yarn Site:
# Adds three properties: the ResourceManager hostname (localhost), the
# mapreduce_shuffle NodeManager auxiliary service, and the NodeManager
# env-whitelist of environment variable names.
!sed -i '/<configuration>/a\
  <property>\n\
    <description>The hostname of the RM.</description>\n\
    <name>yarn.resourcemanager.hostname</name>\n\
    <value>localhost</value>\n\
  </property>\n\
  <property>\n\
    <name>yarn.nodemanager.aux-services</name>\n\
    <value>mapreduce_shuffle</value>\n\
  </property>\n\
  <property>\n\
    <name>yarn.nodemanager.env-whitelist</name>\n\
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>\n\
  </property>' \
$HADOOP_HOME/etc/hadoop/yarn-site.xml

# Format NameNode:
!$HADOOP_HOME/bin/hdfs namenode -format

# Cleanup:
!rm hadoop-3.2.3.tar.gz
!rm -rf hadoop-3.2.3

# Monitoring Hadoop Cluster With Browser Interface:
from google.colab import output
output.serve_kernel_port_as_window(9870)

# Start HDFS:
!$HADOOP_HOME/sbin/start-dfs.sh

# Start YARN:
!nohup $HADOOP_HOME/sbin/start-yarn.sh

In [None]:
# Word Count Example (Pseudo Distributed Mode):
!wget -q https://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/0/101/101.txt
!$HADOOP_HOME/bin/hdfs dfs -mkdir /word_count
!$HADOOP_HOME/bin/hdfs dfs -put /content/101.txt /word_count
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.3.jar wordcount /word_count/101.txt /word_count/output/
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count/output/part-r-00000 | head -50

In [None]:
# Get System Report:
# Runs `hdfs dfsadmin -report` from the Hadoop install referenced by the
# HADOOP_HOME environment variable and prints its output in the cell.
# NOTE(review): presumably requires the HDFS daemons to be running - confirm.
!$HADOOP_HOME/bin/hdfs dfsadmin -report

In [None]:
# Read More: https://colab.research.google.com/github/LMAPcoder/Hadoop-on-Colab/blob/main/Hadoop_on_Colab.ipynb#scrollTo=j7iQ3JIqP9Av